]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
[YoutubeDL] Add simple tests for format_note (Closes #2825)
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import datetime
8 import errno
9 import io
10 import json
11 import locale
12 import os
13 import platform
14 import re
15 import shutil
16 import subprocess
17 import socket
18 import sys
19 import time
20 import traceback
21
22 if os.name == 'nt':
23 import ctypes
24
25 from .utils import (
26 compat_cookiejar,
27 compat_http_client,
28 compat_str,
29 compat_urllib_error,
30 compat_urllib_request,
31 ContentTooShortError,
32 date_from_str,
33 DateRange,
34 determine_ext,
35 DownloadError,
36 encodeFilename,
37 ExtractorError,
38 format_bytes,
39 formatSeconds,
40 get_term_width,
41 locked_file,
42 make_HTTPS_handler,
43 MaxDownloadsReached,
44 PagedList,
45 PostProcessingError,
46 platform_name,
47 preferredencoding,
48 SameFileError,
49 sanitize_filename,
50 subtitles_filename,
51 takewhile_inclusive,
52 UnavailableVideoError,
53 url_basename,
54 write_json_file,
55 write_string,
56 YoutubeDLHandler,
57 prepend_extension,
58 )
59 from .extractor import get_info_extractor, gen_extractors
60 from .downloader import get_suitable_downloader
61 from .postprocessor import FFmpegMergerPP
62 from .version import __version__
63
64
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL process the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    nooverwrites:      Prevent overwriting files.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatic subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       None to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    cookiefile:        File name where cookies should be read from and dumped to.
    nocheckcertificate:Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.

    The following parameters are not used by YoutubeDL itself, they are used by
    the FileDownloader:
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle

    The following options are used by the post processors:
    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                       otherwise prefer avconv.
    """

    # Class-level defaults; all of these are replaced with per-instance
    # values in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
182 def __init__(self, params=None):
183 """Create a FileDownloader object with the given options."""
184 if params is None:
185 params = {}
186 self._ies = []
187 self._ies_instances = {}
188 self._pps = []
189 self._progress_hooks = []
190 self._download_retcode = 0
191 self._num_downloads = 0
192 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
193 self._err_file = sys.stderr
194 self.params = params
195
196 if params.get('bidi_workaround', False):
197 try:
198 import pty
199 master, slave = pty.openpty()
200 width = get_term_width()
201 if width is None:
202 width_args = []
203 else:
204 width_args = ['-w', str(width)]
205 sp_kwargs = dict(
206 stdin=subprocess.PIPE,
207 stdout=slave,
208 stderr=self._err_file)
209 try:
210 self._output_process = subprocess.Popen(
211 ['bidiv'] + width_args, **sp_kwargs
212 )
213 except OSError:
214 self._output_process = subprocess.Popen(
215 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
216 self._output_channel = os.fdopen(master, 'rb')
217 except OSError as ose:
218 if ose.errno == 2:
219 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
220 else:
221 raise
222
223 if (sys.version_info >= (3,) and sys.platform != 'win32' and
224 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
225 and not params['restrictfilenames']):
226 # On Python 3, the Unicode filesystem API will throw errors (#1474)
227 self.report_warning(
228 'Assuming --restrict-filenames since file system encoding '
229 'cannot encode all charactes. '
230 'Set the LC_ALL environment variable to fix this.')
231 self.params['restrictfilenames'] = True
232
233 if '%(stitle)s' in self.params.get('outtmpl', ''):
234 self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
235
236 self._setup_opener()
237
238 def add_info_extractor(self, ie):
239 """Add an InfoExtractor object to the end of the list."""
240 self._ies.append(ie)
241 self._ies_instances[ie.ie_key()] = ie
242 ie.set_downloader(self)
243
244 def get_info_extractor(self, ie_key):
245 """
246 Get an instance of an IE with name ie_key, it will try to get one from
247 the _ies list, if there's no instance it will create a new one and add
248 it to the extractor list.
249 """
250 ie = self._ies_instances.get(ie_key)
251 if ie is None:
252 ie = get_info_extractor(ie_key)()
253 self.add_info_extractor(ie)
254 return ie
255
256 def add_default_info_extractors(self):
257 """
258 Add the InfoExtractors returned by gen_extractors to the end of the list
259 """
260 for ie in gen_extractors():
261 self.add_info_extractor(ie)
262
263 def add_post_processor(self, pp):
264 """Add a PostProcessor object to the end of the chain."""
265 self._pps.append(pp)
266 pp.set_downloader(self)
267
268 def add_progress_hook(self, ph):
269 """Add the progress hook (currently only for the file downloader)"""
270 self._progress_hooks.append(ph)
271
272 def _bidi_workaround(self, message):
273 if not hasattr(self, '_output_channel'):
274 return message
275
276 assert hasattr(self, '_output_process')
277 assert type(message) == type('')
278 line_count = message.count('\n') + 1
279 self._output_process.stdin.write((message + '\n').encode('utf-8'))
280 self._output_process.stdin.flush()
281 res = ''.join(self._output_channel.readline().decode('utf-8')
282 for _ in range(line_count))
283 return res[:-len('\n')]
284
285 def to_screen(self, message, skip_eol=False):
286 """Print message to stdout if not in quiet mode."""
287 return self.to_stdout(message, skip_eol, check_quiet=True)
288
289 def _write_string(self, s, out=None):
290 write_string(s, out=out, encoding=self.params.get('encoding'))
291
292 def to_stdout(self, message, skip_eol=False, check_quiet=False):
293 """Print message to stdout if not in quiet mode."""
294 if self.params.get('logger'):
295 self.params['logger'].debug(message)
296 elif not check_quiet or not self.params.get('quiet', False):
297 message = self._bidi_workaround(message)
298 terminator = ['\n', ''][skip_eol]
299 output = message + terminator
300
301 self._write_string(output, self._screen_file)
302
303 def to_stderr(self, message):
304 """Print message to stderr."""
305 assert type(message) == type('')
306 if self.params.get('logger'):
307 self.params['logger'].error(message)
308 else:
309 message = self._bidi_workaround(message)
310 output = message + '\n'
311 self._write_string(output, self._err_file)
312
313 def to_console_title(self, message):
314 if not self.params.get('consoletitle', False):
315 return
316 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
317 # c_wchar_p() might not be necessary if `message` is
318 # already of type unicode()
319 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
320 elif 'TERM' in os.environ:
321 self._write_string('\033]0;%s\007' % message, self._screen_file)
322
323 def save_console_title(self):
324 if not self.params.get('consoletitle', False):
325 return
326 if 'TERM' in os.environ:
327 # Save the title on stack
328 self._write_string('\033[22;0t', self._screen_file)
329
330 def restore_console_title(self):
331 if not self.params.get('consoletitle', False):
332 return
333 if 'TERM' in os.environ:
334 # Restore the title from stack
335 self._write_string('\033[23;0t', self._screen_file)
336
337 def __enter__(self):
338 self.save_console_title()
339 return self
340
341 def __exit__(self, *args):
342 self.restore_console_title()
343
344 if self.params.get('cookiefile') is not None:
345 self.cookiejar.save()
346
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # ExtractorError carries the original exception in its
                    # .exc_info attribute; include that traceback first.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped original
            # exc_info (see above) so the root cause is preserved.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # ignoreerrors: swallow the problem but remember a failing exit code.
        self._download_retcode = 1
376
377 def report_warning(self, message):
378 '''
379 Print the message to stderr, it will be prefixed with 'WARNING:'
380 If stderr is a tty file the 'WARNING:' will be colored
381 '''
382 if self.params.get('logger') is not None:
383 self.params['logger'].warning(message)
384 else:
385 if self.params.get('no_warnings'):
386 return
387 if self._err_file.isatty() and os.name != 'nt':
388 _msg_header = '\033[0;33mWARNING:\033[0m'
389 else:
390 _msg_header = 'WARNING:'
391 warning_message = '%s %s' % (_msg_header, message)
392 self.to_stderr(warning_message)
393
394 def report_error(self, message, tb=None):
395 '''
396 Do the same as trouble, but prefixes the message with 'ERROR:', colored
397 in red if stderr is a tty file.
398 '''
399 if self._err_file.isatty() and os.name != 'nt':
400 _msg_header = '\033[0;31mERROR:\033[0m'
401 else:
402 _msg_header = 'ERROR:'
403 error_message = '%s %s' % (_msg_header, message)
404 self.trouble(error_message, tb)
405
406 def report_file_already_downloaded(self, file_name):
407 """Report file has already been fully downloaded."""
408 try:
409 self.to_screen('[download] %s has already been downloaded' % file_name)
410 except UnicodeEncodeError:
411 self.to_screen('[download] The file has already been downloaded')
412
413 def prepare_filename(self, info_dict):
414 """Generate the output filename."""
415 try:
416 template_dict = dict(info_dict)
417
418 template_dict['epoch'] = int(time.time())
419 autonumber_size = self.params.get('autonumber_size')
420 if autonumber_size is None:
421 autonumber_size = 5
422 autonumber_templ = '%0' + str(autonumber_size) + 'd'
423 template_dict['autonumber'] = autonumber_templ % self._num_downloads
424 if template_dict.get('playlist_index') is not None:
425 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
426 if template_dict.get('resolution') is None:
427 if template_dict.get('width') and template_dict.get('height'):
428 template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
429 elif template_dict.get('height'):
430 template_dict['resolution'] = '%sp' % template_dict['height']
431 elif template_dict.get('width'):
432 template_dict['resolution'] = '?x%d' % template_dict['width']
433
434 sanitize = lambda k, v: sanitize_filename(
435 compat_str(v),
436 restricted=self.params.get('restrictfilenames'),
437 is_id=(k == 'id'))
438 template_dict = dict((k, sanitize(k, v))
439 for k, v in template_dict.items()
440 if v is not None)
441 template_dict = collections.defaultdict(lambda: 'NA', template_dict)
442
443 tmpl = os.path.expanduser(self.params['outtmpl'])
444 filename = tmpl % template_dict
445 return filename
446 except ValueError as err:
447 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
448 return None
449
450 def _match_entry(self, info_dict):
451 """ Returns None iff the file should be downloaded """
452
453 video_title = info_dict.get('title', info_dict.get('id', 'video'))
454 if 'title' in info_dict:
455 # This can happen when we're just evaluating the playlist
456 title = info_dict['title']
457 matchtitle = self.params.get('matchtitle', False)
458 if matchtitle:
459 if not re.search(matchtitle, title, re.IGNORECASE):
460 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
461 rejecttitle = self.params.get('rejecttitle', False)
462 if rejecttitle:
463 if re.search(rejecttitle, title, re.IGNORECASE):
464 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
465 date = info_dict.get('upload_date', None)
466 if date is not None:
467 dateRange = self.params.get('daterange', DateRange())
468 if date not in dateRange:
469 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
470 view_count = info_dict.get('view_count', None)
471 if view_count is not None:
472 min_views = self.params.get('min_views')
473 if min_views is not None and view_count < min_views:
474 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
475 max_views = self.params.get('max_views')
476 if max_views is not None and view_count > max_views:
477 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
478 age_limit = self.params.get('age_limit')
479 if age_limit is not None:
480 if age_limit < info_dict.get('age_limit', 0):
481 return 'Skipping "' + title + '" because it is age restricted'
482 if self.in_download_archive(info_dict):
483 return '%s has already been recorded in archive' % video_title
484 return None
485
486 @staticmethod
487 def add_extra_info(info_dict, extra_info):
488 '''Set the keys from extra_info in info dict if they are missing'''
489 for key, value in extra_info.items():
490 info_dict.setdefault(key, value)
491
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        When ie_key is given, only that single extractor is tried; otherwise
        the registered extractors are probed in order and the first one whose
        suitable() accepts the URL wins.  With process=False the raw extractor
        result is returned without resolving playlists/URL references.
        '''

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as de:  # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
            except MaxDownloadsReached:
                # Propagate: this is the normal way --max-downloads stops us.
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: no extractor accepted the URL at all.
            self.report_error('no suitable InfoExtractor for URL %s' % url)
541
542 def add_default_extra_info(self, ie_result, ie, url):
543 self.add_extra_info(ie_result, {
544 'extractor': ie.IE_NAME,
545 'webpage_url': url,
546 'webpage_url_basename': url_basename(url),
547 'extractor_key': ie.ie_key(),
548 })
549
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']: 'video' (default), 'url',
        'url_transparent', 'playlist' and the legacy 'compat_list'.
        Recurses until everything is resolved down to videos.
        """

        result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            def make_result(embedded_info):
                # Merge: keep fields of the embedding result, but take the
                # listed technical fields from the embedded (real) result.
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'ie_key', 'duration',
                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
                        new_result[f] = embedded_info[f]
                return new_result
            new_result = make_result(info)

            # A url_transparent result must not resolve to another
            # url_transparent one, or we would recurse forever.
            assert new_result.get('_type') != 'url_transparent'
            if new_result.get('_type') == 'compat_list':
                new_result['entries'] = [
                    make_result(e) for e in new_result['entries']]

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in params, 0-based internally.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            if isinstance(ie_result['entries'], list):
                n_all_entries = len(ie_result['entries'])
                entries = ie_result['entries'][playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            else:
                # Lazily-paged playlist: only fetch the requested slice.
                assert isinstance(ie_result['entries'], PagedList)
                entries = ie_result['entries'].getslice(
                    playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
                extra = {
                    'playlist': playlist,
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Apply title/date/view-count/... filters per entry.
                reason = self._match_entry(entry)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            # Legacy list-of-dicts result: annotate each entry, then resolve.
            def _fixup(r):
                self.add_extra_info(r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    })
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
664
665 def select_format(self, format_spec, available_formats):
666 if format_spec == 'best' or format_spec is None:
667 return available_formats[-1]
668 elif format_spec == 'worst':
669 return available_formats[0]
670 elif format_spec == 'bestaudio':
671 audio_formats = [
672 f for f in available_formats
673 if f.get('vcodec') == 'none']
674 if audio_formats:
675 return audio_formats[-1]
676 elif format_spec == 'worstaudio':
677 audio_formats = [
678 f for f in available_formats
679 if f.get('vcodec') == 'none']
680 if audio_formats:
681 return audio_formats[0]
682 elif format_spec == 'bestvideo':
683 video_formats = [
684 f for f in available_formats
685 if f.get('acodec') == 'none']
686 if video_formats:
687 return video_formats[-1]
688 elif format_spec == 'worstvideo':
689 video_formats = [
690 f for f in available_formats
691 if f.get('acodec') == 'none']
692 if video_formats:
693 return video_formats[0]
694 else:
695 extensions = ['mp4', 'flv', 'webm', '3gp']
696 if format_spec in extensions:
697 filter_f = lambda f: f['ext'] == format_spec
698 else:
699 filter_f = lambda f: f['format_id'] == format_spec
700 matches = list(filter(filter_f, available_formats))
701 if matches:
702 return matches[-1]
703 return None
704
    def process_video_result(self, info_dict, download=True):
        """Validate a resolved 'video' result, normalize its fields, select
        the format(s) to download per the 'format' option, and (when
        *download* is true) hand each selected format to process_info().

        Returns info_dict, updated in place with the chosen best format.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'id' not in info_dict:
            raise ExtractorError('Missing "id" field in extractor result')
        if 'title' not in info_dict:
            raise ExtractorError('Missing "title" field in extractor result')

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # Derive upload_date from a numeric timestamp when only the latter
        # was provided by the extractor.
        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
            upload_date = datetime.datetime.utcfromtimestamp(
                info_dict['timestamp'])
            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        if not formats:
            raise ExtractorError('No video formats found!')

        # We check that all the formats have the format and format_id fields
        for i, format in enumerate(formats):
            if 'url' not in format:
                raise ExtractorError('Missing "url" key in result (index %d)' % i)

            if format.get('format_id') is None:
                # Fall back to the position in the list as an id.
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url']).lower()

        # --format-limit: cut the (worst-first) list right after the limit id.
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' fields if the original info_dict list them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats', None):
            # --list-formats: print and stop, nothing is downloaded.
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        # The -1 is for supporting YoutubeIE
        if req_format in ('-1', 'all'):
            formats_to_download = formats
        else:
            # We can accept formats requested in the format: 34/5/best, we pick
            # the first that is available, starting from left
            req_formats = req_format.split('/')
            for rf in req_formats:
                if re.match(r'.+?\+.+?', rf) is not None:
                    # Two formats have been requested like '137+139'
                    format_1, format_2 = rf.split('+')
                    formats_info = (self.select_format(format_1, formats),
                                    self.select_format(format_2, formats))
                    if all(formats_info):
                        # Merged download: both halves must be available.
                        selected_format = {
                            'requested_formats': formats_info,
                            'format': rf,
                            'ext': formats_info[0]['ext'],
                        }
                    else:
                        selected_format = None
                else:
                    selected_format = self.select_format(rf, formats)
                if selected_format is not None:
                    formats_to_download = [selected_format]
                    break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
821
822 def process_info(self, info_dict):
823 """Process a single resolved IE result."""
824
825 assert info_dict.get('_type', 'video') == 'video'
826
827 max_downloads = self.params.get('max_downloads')
828 if max_downloads is not None:
829 if self._num_downloads >= int(max_downloads):
830 raise MaxDownloadsReached()
831
832 info_dict['fulltitle'] = info_dict['title']
833 if len(info_dict['title']) > 200:
834 info_dict['title'] = info_dict['title'][:197] + '...'
835
836 # Keep for backwards compatibility
837 info_dict['stitle'] = info_dict['title']
838
839 if not 'format' in info_dict:
840 info_dict['format'] = info_dict['ext']
841
842 reason = self._match_entry(info_dict)
843 if reason is not None:
844 self.to_screen('[download] ' + reason)
845 return
846
847 self._num_downloads += 1
848
849 filename = self.prepare_filename(info_dict)
850
851 # Forced printings
852 if self.params.get('forcetitle', False):
853 self.to_stdout(info_dict['fulltitle'])
854 if self.params.get('forceid', False):
855 self.to_stdout(info_dict['id'])
856 if self.params.get('forceurl', False):
857 # For RTMP URLs, also include the playpath
858 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
859 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
860 self.to_stdout(info_dict['thumbnail'])
861 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
862 self.to_stdout(info_dict['description'])
863 if self.params.get('forcefilename', False) and filename is not None:
864 self.to_stdout(filename)
865 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
866 self.to_stdout(formatSeconds(info_dict['duration']))
867 if self.params.get('forceformat', False):
868 self.to_stdout(info_dict['format'])
869 if self.params.get('forcejson', False):
870 info_dict['_filename'] = filename
871 self.to_stdout(json.dumps(info_dict))
872
873 # Do nothing else if in simulate mode
874 if self.params.get('simulate', False):
875 return
876
877 if filename is None:
878 return
879
880 try:
881 dn = os.path.dirname(encodeFilename(filename))
882 if dn and not os.path.exists(dn):
883 os.makedirs(dn)
884 except (OSError, IOError) as err:
885 self.report_error('unable to create directory ' + compat_str(err))
886 return
887
888 if self.params.get('writedescription', False):
889 descfn = filename + '.description'
890 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
891 self.to_screen('[info] Video description is already present')
892 else:
893 try:
894 self.to_screen('[info] Writing video description to: ' + descfn)
895 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
896 descfile.write(info_dict['description'])
897 except (KeyError, TypeError):
898 self.report_warning('There\'s no description to write.')
899 except (OSError, IOError):
900 self.report_error('Cannot write description file ' + descfn)
901 return
902
903 if self.params.get('writeannotations', False):
904 annofn = filename + '.annotations.xml'
905 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
906 self.to_screen('[info] Video annotations are already present')
907 else:
908 try:
909 self.to_screen('[info] Writing video annotations to: ' + annofn)
910 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
911 annofile.write(info_dict['annotations'])
912 except (KeyError, TypeError):
913 self.report_warning('There are no annotations to write.')
914 except (OSError, IOError):
915 self.report_error('Cannot write annotations file: ' + annofn)
916 return
917
918 subtitles_are_requested = any([self.params.get('writesubtitles', False),
919 self.params.get('writeautomaticsub')])
920
921 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
922 # subtitles download errors are already managed as troubles in relevant IE
923 # that way it will silently go on when used with unsupporting IE
924 subtitles = info_dict['subtitles']
925 sub_format = self.params.get('subtitlesformat', 'srt')
926 for sub_lang in subtitles.keys():
927 sub = subtitles[sub_lang]
928 if sub is None:
929 continue
930 try:
931 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
932 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
933 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
934 else:
935 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
936 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
937 subfile.write(sub)
938 except (OSError, IOError):
939 self.report_error('Cannot write subtitles file ' + sub_filename)
940 return
941
942 if self.params.get('writeinfojson', False):
943 infofn = os.path.splitext(filename)[0] + '.info.json'
944 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
945 self.to_screen('[info] Video description metadata is already present')
946 else:
947 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
948 try:
949 write_json_file(info_dict, encodeFilename(infofn))
950 except (OSError, IOError):
951 self.report_error('Cannot write metadata to JSON file ' + infofn)
952 return
953
954 if self.params.get('writethumbnail', False):
955 if info_dict.get('thumbnail') is not None:
956 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
957 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
958 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
959 self.to_screen('[%s] %s: Thumbnail is already present' %
960 (info_dict['extractor'], info_dict['id']))
961 else:
962 self.to_screen('[%s] %s: Downloading thumbnail ...' %
963 (info_dict['extractor'], info_dict['id']))
964 try:
965 uf = self.urlopen(info_dict['thumbnail'])
966 with open(thumb_filename, 'wb') as thumbf:
967 shutil.copyfileobj(uf, thumbf)
968 self.to_screen('[%s] %s: Writing thumbnail to: %s' %
969 (info_dict['extractor'], info_dict['id'], thumb_filename))
970 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
971 self.report_warning('Unable to download thumbnail "%s": %s' %
972 (info_dict['thumbnail'], compat_str(err)))
973
974 if not self.params.get('skip_download', False):
975 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
976 success = True
977 else:
978 try:
979 def dl(name, info):
980 fd = get_suitable_downloader(info)(self, self.params)
981 for ph in self._progress_hooks:
982 fd.add_progress_hook(ph)
983 return fd.download(name, info)
984 if info_dict.get('requested_formats') is not None:
985 downloaded = []
986 success = True
987 merger = FFmpegMergerPP(self)
988 if not merger._get_executable():
989 postprocessors = []
990 self.report_warning('You have requested multiple '
991 'formats but ffmpeg or avconv are not installed.'
992 ' The formats won\'t be merged')
993 else:
994 postprocessors = [merger]
995 for f in info_dict['requested_formats']:
996 new_info = dict(info_dict)
997 new_info.update(f)
998 fname = self.prepare_filename(new_info)
999 fname = prepend_extension(fname, 'f%s' % f['format_id'])
1000 downloaded.append(fname)
1001 partial_success = dl(fname, new_info)
1002 success = success and partial_success
1003 info_dict['__postprocessors'] = postprocessors
1004 info_dict['__files_to_merge'] = downloaded
1005 else:
1006 # Just a single file
1007 success = dl(filename, info_dict)
1008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1009 self.report_error('unable to download video data: %s' % str(err))
1010 return
1011 except (OSError, IOError) as err:
1012 raise UnavailableVideoError(err)
1013 except (ContentTooShortError, ) as err:
1014 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1015 return
1016
1017 if success:
1018 try:
1019 self.post_process(filename, info_dict)
1020 except (PostProcessingError) as err:
1021 self.report_error('postprocessing: %s' % str(err))
1022 return
1023
1024 self.record_download_archive(info_dict)
1025
1026 def download(self, url_list):
1027 """Download a given list of URLs."""
1028 if (len(url_list) > 1 and
1029 '%' not in self.params['outtmpl']
1030 and self.params.get('max_downloads') != 1):
1031 raise SameFileError(self.params['outtmpl'])
1032
1033 for url in url_list:
1034 try:
1035 #It also downloads the videos
1036 self.extract_info(url)
1037 except UnavailableVideoError:
1038 self.report_error('unable to download video')
1039 except MaxDownloadsReached:
1040 self.to_screen('[info] Maximum number of downloaded files reached.')
1041 raise
1042
1043 return self._download_retcode
1044
1045 def download_with_info_file(self, info_filename):
1046 with io.open(info_filename, 'r', encoding='utf-8') as f:
1047 info = json.load(f)
1048 try:
1049 self.process_ie_result(info, download=True)
1050 except DownloadError:
1051 webpage_url = info.get('webpage_url')
1052 if webpage_url is not None:
1053 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1054 return self.download([webpage_url])
1055 else:
1056 raise
1057 return self._download_retcode
1058
1059 def post_process(self, filename, ie_info):
1060 """Run all the postprocessors on the given file."""
1061 info = dict(ie_info)
1062 info['filepath'] = filename
1063 keep_video = None
1064 pps_chain = []
1065 if ie_info.get('__postprocessors') is not None:
1066 pps_chain.extend(ie_info['__postprocessors'])
1067 pps_chain.extend(self._pps)
1068 for pp in pps_chain:
1069 try:
1070 keep_video_wish, new_info = pp.run(info)
1071 if keep_video_wish is not None:
1072 if keep_video_wish:
1073 keep_video = keep_video_wish
1074 elif keep_video is None:
1075 # No clear decision yet, let IE decide
1076 keep_video = keep_video_wish
1077 except PostProcessingError as e:
1078 self.report_error(e.msg)
1079 if keep_video is False and not self.params.get('keepvideo', False):
1080 try:
1081 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1082 os.remove(encodeFilename(filename))
1083 except (IOError, OSError):
1084 self.report_warning('Unable to remove downloaded video file')
1085
1086 def _make_archive_id(self, info_dict):
1087 # Future-proof against any change in case
1088 # and backwards compatibility with prior versions
1089 extractor = info_dict.get('extractor_key')
1090 if extractor is None:
1091 if 'id' in info_dict:
1092 extractor = info_dict.get('ie_key') # key in a playlist
1093 if extractor is None:
1094 return None # Incomplete video information
1095 return extractor.lower() + ' ' + info_dict['id']
1096
1097 def in_download_archive(self, info_dict):
1098 fn = self.params.get('download_archive')
1099 if fn is None:
1100 return False
1101
1102 vid_id = self._make_archive_id(info_dict)
1103 if vid_id is None:
1104 return False # Incomplete video information
1105
1106 try:
1107 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1108 for line in archive_file:
1109 if line.strip() == vid_id:
1110 return True
1111 except IOError as ioe:
1112 if ioe.errno != errno.ENOENT:
1113 raise
1114 return False
1115
1116 def record_download_archive(self, info_dict):
1117 fn = self.params.get('download_archive')
1118 if fn is None:
1119 return
1120 vid_id = self._make_archive_id(info_dict)
1121 assert vid_id
1122 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1123 archive_file.write(vid_id + '\n')
1124
1125 @staticmethod
1126 def format_resolution(format, default='unknown'):
1127 if format.get('vcodec') == 'none':
1128 return 'audio only'
1129 if format.get('resolution') is not None:
1130 return format['resolution']
1131 if format.get('height') is not None:
1132 if format.get('width') is not None:
1133 res = '%sx%s' % (format['width'], format['height'])
1134 else:
1135 res = '%sp' % format['height']
1136 elif format.get('width') is not None:
1137 res = '?x%d' % format['width']
1138 else:
1139 res = default
1140 return res
1141
1142 def _format_note(self, fdict):
1143 res = ''
1144 if fdict.get('ext') in ['f4f', 'f4m']:
1145 res += '(unsupported) '
1146 if fdict.get('format_note') is not None:
1147 res += fdict['format_note'] + ' '
1148 if fdict.get('tbr') is not None:
1149 res += '%4dk ' % fdict['tbr']
1150 if fdict.get('container') is not None:
1151 if res:
1152 res += ', '
1153 res += '%s container' % fdict['container']
1154 if (fdict.get('vcodec') is not None and
1155 fdict.get('vcodec') != 'none'):
1156 if res:
1157 res += ', '
1158 res += fdict['vcodec']
1159 if fdict.get('vbr') is not None:
1160 res += '@'
1161 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1162 res += 'video@'
1163 if fdict.get('vbr') is not None:
1164 res += '%4dk' % fdict['vbr']
1165 if fdict.get('acodec') is not None:
1166 if res:
1167 res += ', '
1168 if fdict['acodec'] == 'none':
1169 res += 'video only'
1170 else:
1171 res += '%-5s' % fdict['acodec']
1172 elif fdict.get('abr') is not None:
1173 if res:
1174 res += ', '
1175 res += 'audio'
1176 if fdict.get('abr') is not None:
1177 res += '@%3dk' % fdict['abr']
1178 if fdict.get('asr') is not None:
1179 res += ' (%5dHz)' % fdict['asr']
1180 if fdict.get('filesize') is not None:
1181 if res:
1182 res += ', '
1183 res += format_bytes(fdict['filesize'])
1184 return res
1185
1186 def list_formats(self, info_dict):
1187 def line(format, idlen=20):
1188 return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1189 format['format_id'],
1190 format['ext'],
1191 self.format_resolution(format),
1192 self._format_note(format),
1193 ))
1194
1195 formats = info_dict.get('formats', [info_dict])
1196 idlen = max(len('format code'),
1197 max(len(f['format_id']) for f in formats))
1198 formats_s = [line(f, idlen) for f in formats]
1199 if len(formats) > 1:
1200 formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
1201 formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
1202
1203 header_line = line({
1204 'format_id': 'format code', 'ext': 'extension',
1205 'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1206 self.to_screen('[info] Available formats for %s:\n%s\n%s' %
1207 (info_dict['id'], header_line, '\n'.join(formats_s)))
1208
1209 def urlopen(self, req):
1210 """ Start an HTTP download """
1211 return self._opener.open(req, timeout=self._socket_timeout)
1212
1213 def print_debug_header(self):
1214 if not self.params.get('verbose'):
1215 return
1216
1217 write_string(
1218 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1219 locale.getpreferredencoding(),
1220 sys.getfilesystemencoding(),
1221 sys.stdout.encoding,
1222 self.get_encoding()),
1223 encoding=None
1224 )
1225
1226 self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1227 try:
1228 sp = subprocess.Popen(
1229 ['git', 'rev-parse', '--short', 'HEAD'],
1230 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1231 cwd=os.path.dirname(os.path.abspath(__file__)))
1232 out, err = sp.communicate()
1233 out = out.decode().strip()
1234 if re.match('[0-9a-f]+', out):
1235 self._write_string('[debug] Git HEAD: ' + out + '\n')
1236 except:
1237 try:
1238 sys.exc_clear()
1239 except:
1240 pass
1241 self._write_string('[debug] Python version %s - %s' %
1242 (platform.python_version(), platform_name()) + '\n')
1243
1244 proxy_map = {}
1245 for handler in self._opener.handlers:
1246 if hasattr(handler, 'proxies'):
1247 proxy_map.update(handler.proxies)
1248 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1249
1250 def _setup_opener(self):
1251 timeout_val = self.params.get('socket_timeout')
1252 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
1253
1254 opts_cookiefile = self.params.get('cookiefile')
1255 opts_proxy = self.params.get('proxy')
1256
1257 if opts_cookiefile is None:
1258 self.cookiejar = compat_cookiejar.CookieJar()
1259 else:
1260 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1261 opts_cookiefile)
1262 if os.access(opts_cookiefile, os.R_OK):
1263 self.cookiejar.load()
1264
1265 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1266 self.cookiejar)
1267 if opts_proxy is not None:
1268 if opts_proxy == '':
1269 proxies = {}
1270 else:
1271 proxies = {'http': opts_proxy, 'https': opts_proxy}
1272 else:
1273 proxies = compat_urllib_request.getproxies()
1274 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1275 if 'http' in proxies and 'https' not in proxies:
1276 proxies['https'] = proxies['http']
1277 proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1278
1279 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1280 https_handler = make_HTTPS_handler(
1281 self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
1282 ydlh = YoutubeDLHandler(debuglevel=debuglevel)
1283 opener = compat_urllib_request.build_opener(
1284 https_handler, proxy_handler, cookie_processor, ydlh)
1285 # Delete the default user-agent header, which would otherwise apply in
1286 # cases where our custom HTTP handler doesn't come into play
1287 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1288 opener.addheaders = []
1289 self._opener = opener
1290
1291 def encode(self, s):
1292 if isinstance(s, bytes):
1293 return s # Already encoded
1294
1295 try:
1296 return s.encode(self.get_encoding())
1297 except UnicodeEncodeError as err:
1298 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
1299 raise
1300
1301 def get_encoding(self):
1302 encoding = self.params.get('encoding')
1303 if encoding is None:
1304 encoding = preferredencoding()
1305 return encoding