1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import collections
7 import errno
8 import io
9 import json
10 import os
11 import platform
12 import re
13 import shutil
14 import subprocess
15 import socket
16 import sys
17 import time
18 import traceback
19
20 if os.name == 'nt':
21 import ctypes
22
23 from .utils import (
24 compat_cookiejar,
25 compat_http_client,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_request,
29 ContentTooShortError,
30 date_from_str,
31 DateRange,
32 determine_ext,
33 DownloadError,
34 encodeFilename,
35 ExtractorError,
36 format_bytes,
37 formatSeconds,
38 get_term_width,
39 locked_file,
40 make_HTTPS_handler,
41 MaxDownloadsReached,
42 PostProcessingError,
43 platform_name,
44 preferredencoding,
45 SameFileError,
46 sanitize_filename,
47 subtitles_filename,
48 takewhile_inclusive,
49 UnavailableVideoError,
50 url_basename,
51 write_json_file,
52 write_string,
53 YoutubeDLHandler,
54 )
55 from .extractor import get_info_extractor, gen_extractors
56 from .downloader import get_suitable_downloader
57 from .version import __version__
58
59
60 class YoutubeDL(object):
61 """YoutubeDL class.
62
63 YoutubeDL objects are responsible for downloading the actual
64 video file and writing it to disk if the user has requested it,
65 among some other tasks. In most cases there should be one per
66 program. Given a video URL, the downloader does not know how to
67 extract all the needed information; that is the task of the
68 InfoExtractors, so it has to pass the URL to one of them.
69
70 For this, YoutubeDL objects have a method that allows
71 InfoExtractors to be registered in a given order. When it is passed
72 a URL, the YoutubeDL object hands it to the first InfoExtractor it
73 finds that reports being able to handle it. The InfoExtractor extracts
74 all the information about the video or videos the URL refers to, and
75 YoutubeDL processes the extracted information, possibly using a File
76 Downloader to download the video.
77
78 YoutubeDL objects accept a lot of parameters. In order not to saturate
79 the object constructor with arguments, it receives a dictionary of
80 options instead. These options are available through the params
81 attribute for the InfoExtractors to use. The YoutubeDL also
82 registers itself as the downloader in charge for the InfoExtractors
83 that are added to it, so this is a "mutual registration".
84
85 Available options:
86
87 username: Username for authentication purposes.
88 password: Password for authentication purposes.
89 videopassword: Password for accessing a video.
90 usenetrc: Use netrc for authentication instead.
91 verbose: Print additional info to stdout.
92 quiet: Do not print messages to stdout.
93 forceurl: Force printing final URL.
94 forcetitle: Force printing title.
95 forceid: Force printing ID.
96 forcethumbnail: Force printing thumbnail URL.
97 forcedescription: Force printing description.
98 forcefilename: Force printing final filename.
99 forceduration: Force printing duration.
100 forcejson: Force printing info_dict as JSON.
101 simulate: Do not download the video files.
102 format: Video format code.
103 format_limit: Highest quality format to try.
104 outtmpl: Template for output names.
105 restrictfilenames: Do not allow "&" and spaces in file names
106 ignoreerrors: Do not stop on download errors.
107 nooverwrites: Prevent overwriting files.
108 playliststart: Playlist item to start at.
109 playlistend: Playlist item to end at.
110 matchtitle: Download only matching titles.
111 rejecttitle: Reject downloads for matching titles.
112 logger: Log messages to a logging.Logger instance.
113 logtostderr: Log messages to stderr instead of stdout.
114 writedescription: Write the video description to a .description file
115 writeinfojson: Write the video description to a .info.json file
116 writeannotations: Write the video annotations to a .annotations.xml file
117 writethumbnail: Write the thumbnail image to a file
118 writesubtitles: Write the video subtitles to a file
119 writeautomaticsub: Write the automatic subtitles to a file
120 allsubtitles: Downloads all the subtitles of the video
121 (requires writesubtitles or writeautomaticsub)
122 listsubtitles: Lists all available subtitles for the video
123 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
124 subtitleslangs: List of languages of the subtitles to download
125 keepvideo: Keep the video file after post-processing
126 daterange: A DateRange object, download only if the upload_date is in the range.
127 skip_download: Skip the actual download of the video file
128 cachedir: Location of the cache files in the filesystem.
129 None to disable filesystem cache.
130 noplaylist: Download single video instead of a playlist if in doubt.
131 age_limit: An integer representing the user's age in years.
132 Videos unsuitable for the given age are skipped.
133 min_views: An integer representing the minimum view count the video
134 must have in order to not be skipped.
135 Videos without view count information are always
136 downloaded. None for no limit.
137 max_views: An integer representing the maximum view count.
138 Videos that are more popular than that are not
139 downloaded.
140 Videos without view count information are always
141 downloaded. None for no limit.
142 download_archive: File name of a file where all downloads are recorded.
143 Videos already present in the file are not downloaded
144 again.
145 cookiefile: File name where cookies should be read from and dumped to.
146 nocheckcertificate:Do not verify SSL certificates
147 proxy: URL of the proxy server to use
148 socket_timeout: Time to wait for unresponsive hosts, in seconds
149 bidi_workaround: Work around buggy terminals without bidirectional text
150 support, using fribidi
151
152 The following parameters are not used by YoutubeDL itself, they are used by
153 the FileDownloader:
154 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
155 noresizebuffer, retries, continuedl, noprogress, consoletitle
156 """
157
158 params = None
159 _ies = []
160 _pps = []
161 _download_retcode = None
162 _num_downloads = None
163 _screen_file = None
164
165 def __init__(self, params=None):
166 """Create a FileDownloader object with the given options."""
167 self._ies = []
168 self._ies_instances = {}
169 self._pps = []
170 self._fd_progress_hooks = []
171 self._download_retcode = 0
172 self._num_downloads = 0
173 self.params = {} if params is None else params
174 self._screen_file = [sys.stdout, sys.stderr][self.params.get('logtostderr', False)]
175 self._err_file = sys.stderr
176
177 if self.params.get('bidi_workaround', False):
178 try:
179 import pty
180 master, slave = pty.openpty()
181 width = get_term_width()
182 if width is None:
183 width_args = []
184 else:
185 width_args = ['-w', str(width)]
186 sp_kwargs = dict(
187 stdin=subprocess.PIPE,
188 stdout=slave,
189 stderr=self._err_file)
190 try:
191 self._output_process = subprocess.Popen(
192 ['bidiv'] + width_args, **sp_kwargs
193 )
194 except OSError:
195 self._output_process = subprocess.Popen(
196 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
197 self._output_channel = os.fdopen(master, 'rb')
198 except OSError as ose:
199 if ose.errno == 2:
200 self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround. Make sure that fribidi is an executable file in one of the directories in your $PATH.')
201 else:
202 raise
203
204 if (sys.version_info >= (3,) and sys.platform != 'win32' and
205 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
206 and not self.params.get('restrictfilenames', False)):
207 # On Python 3, the Unicode filesystem API will throw errors (#1474)
208 self.report_warning(
209 u'Assuming --restrict-filenames since file system encoding '
210 u'cannot encode all characters. '
211 u'Set the LC_ALL environment variable to fix this.')
212 self.params['restrictfilenames'] = True
213
214 if '%(stitle)s' in self.params.get('outtmpl', ''):
215 self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag (which also secures %(uploader)s et al) instead.')
216
217 self._setup_opener()
218
219 def add_info_extractor(self, ie):
220 """Add an InfoExtractor object to the end of the list."""
221 self._ies.append(ie)
222 self._ies_instances[ie.ie_key()] = ie
223 ie.set_downloader(self)
224
225 def get_info_extractor(self, ie_key):
226 """
227 Get an instance of an IE with name ie_key. It will first look for an
228 existing instance; if there is none, it will create a new one and add
229 it to the extractor list.
230 """
231 ie = self._ies_instances.get(ie_key)
232 if ie is None:
233 ie = get_info_extractor(ie_key)()
234 self.add_info_extractor(ie)
235 return ie
236
237 def add_default_info_extractors(self):
238 """
239 Add the InfoExtractors returned by gen_extractors to the end of the list
240 """
241 for ie in gen_extractors():
242 self.add_info_extractor(ie)
243
244 def add_post_processor(self, pp):
245 """Add a PostProcessor object to the end of the chain."""
246 self._pps.append(pp)
247 pp.set_downloader(self)
248
249 def add_downloader_progress_hook(self, ph):
250 """Add the progress hook to the file downloader"""
251 self._fd_progress_hooks.append(ph)
252
253 def _bidi_workaround(self, message):
254 if not hasattr(self, '_output_channel'):
255 return message
256
257 assert hasattr(self, '_output_process')
258 assert type(message) == type(u'')
259 line_count = message.count(u'\n') + 1
260 self._output_process.stdin.write((message + u'\n').encode('utf-8'))
261 self._output_process.stdin.flush()
262 res = u''.join(self._output_channel.readline().decode('utf-8')
263 for _ in range(line_count))
264 return res[:-len(u'\n')]
265
266 def to_screen(self, message, skip_eol=False):
267 """Print message to stdout if not in quiet mode."""
268 return self.to_stdout(message, skip_eol, check_quiet=True)
269
270 def to_stdout(self, message, skip_eol=False, check_quiet=False):
271 """Print message to stdout if not in quiet mode."""
272 if self.params.get('logger'):
273 self.params['logger'].debug(message)
274 elif not check_quiet or not self.params.get('quiet', False):
275 message = self._bidi_workaround(message)
276 terminator = [u'\n', u''][skip_eol]
277 output = message + terminator
278
279 write_string(output, self._screen_file)
280
281 def to_stderr(self, message):
282 """Print message to stderr."""
283 assert type(message) == type(u'')
284 if self.params.get('logger'):
285 self.params['logger'].error(message)
286 else:
287 message = self._bidi_workaround(message)
288 output = message + u'\n'
289 write_string(output, self._err_file)
290
291 def to_console_title(self, message):
292 if not self.params.get('consoletitle', False):
293 return
294 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
295 # c_wchar_p() might not be necessary if `message` is
296 # already of type unicode()
297 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
298 elif 'TERM' in os.environ:
299 write_string(u'\033]0;%s\007' % message, self._screen_file)
300
301 def save_console_title(self):
302 if not self.params.get('consoletitle', False):
303 return
304 if 'TERM' in os.environ:
305 # Save the title on stack
306 write_string(u'\033[22;0t', self._screen_file)
307
308 def restore_console_title(self):
309 if not self.params.get('consoletitle', False):
310 return
311 if 'TERM' in os.environ:
312 # Restore the title from stack
313 write_string(u'\033[23;0t', self._screen_file)
314
315 def __enter__(self):
316 self.save_console_title()
317 return self
318
319 def __exit__(self, *args):
320 self.restore_console_title()
321
322 if self.params.get('cookiefile') is not None:
323 self.cookiejar.save()
324
325 def trouble(self, message=None, tb=None):
326 """Determine action to take when a download problem appears.
327
328 Depending on if the downloader has been configured to ignore
329 download errors or not, this method may throw an exception or
330 not when errors are found, after printing the message.
331
332 tb, if given, is additional traceback information.
333 """
334 if message is not None:
335 self.to_stderr(message)
336 if self.params.get('verbose'):
337 if tb is None:
338 if sys.exc_info()[0]: # if .trouble has been called from an except block
339 tb = u''
340 if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
341 tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
342 tb += compat_str(traceback.format_exc())
343 else:
344 tb_data = traceback.format_list(traceback.extract_stack())
345 tb = u''.join(tb_data)
346 self.to_stderr(tb)
347 if not self.params.get('ignoreerrors', False):
348 if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
349 exc_info = sys.exc_info()[1].exc_info
350 else:
351 exc_info = sys.exc_info()
352 raise DownloadError(message, exc_info)
353 self._download_retcode = 1
354
355 def report_warning(self, message):
356 '''
357 Print the message to stderr; it will be prefixed with 'WARNING:'.
358 If stderr is a tty file, the 'WARNING:' will be colored.
359 '''
360 if self._err_file.isatty() and os.name != 'nt':
361 _msg_header = u'\033[0;33mWARNING:\033[0m'
362 else:
363 _msg_header = u'WARNING:'
364 warning_message = u'%s %s' % (_msg_header, message)
365 self.to_stderr(warning_message)
366
367 def report_error(self, message, tb=None):
368 '''
369 Does the same as trouble, but prefixes the message with 'ERROR:',
370 colored in red if stderr is a tty file.
371 '''
372 if self._err_file.isatty() and os.name != 'nt':
373 _msg_header = u'\033[0;31mERROR:\033[0m'
374 else:
375 _msg_header = u'ERROR:'
376 error_message = u'%s %s' % (_msg_header, message)
377 self.trouble(error_message, tb)
378
379 def report_file_already_downloaded(self, file_name):
380 """Report file has already been fully downloaded."""
381 try:
382 self.to_screen(u'[download] %s has already been downloaded' % file_name)
383 except UnicodeEncodeError:
384 self.to_screen(u'[download] The file has already been downloaded')
385
386 def increment_downloads(self):
387 """Increment the ordinal that assigns a number to each file."""
388 self._num_downloads += 1
389
390 def prepare_filename(self, info_dict):
391 """Generate the output filename."""
392 try:
393 template_dict = dict(info_dict)
394
395 template_dict['epoch'] = int(time.time())
396 autonumber_size = self.params.get('autonumber_size')
397 if autonumber_size is None:
398 autonumber_size = 5
399 autonumber_templ = u'%0' + str(autonumber_size) + u'd'
400 template_dict['autonumber'] = autonumber_templ % self._num_downloads
401 if template_dict.get('playlist_index') is not None:
402 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
403
404 sanitize = lambda k, v: sanitize_filename(
405 compat_str(v),
406 restricted=self.params.get('restrictfilenames'),
407 is_id=(k == u'id'))
408 template_dict = dict((k, sanitize(k, v))
409 for k, v in template_dict.items()
410 if v is not None)
411 template_dict = collections.defaultdict(lambda: u'NA', template_dict)
412
413 tmpl = os.path.expanduser(self.params['outtmpl'])
414 filename = tmpl % template_dict
415 return filename
416 except ValueError as err:
417 self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
418 return None
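# A rough sketch of what this method does with an output template (values
# below are made up for illustration):
#
#     params = {'outtmpl': u'%(autonumber)s-%(title)s-%(id)s.%(ext)s'}
#     info_dict = {'id': u'abc123', 'title': u'Some video', 'ext': u'mp4'}
#
# For the first download, 'autonumber' expands to u'00001' (zero-padded to
# autonumber_size), each value is passed through sanitize_filename(), any
# template field missing from info_dict renders as u'NA', and the result is
# roughly u'00001-Some video-abc123.mp4'.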
419
420 def _match_entry(self, info_dict):
421 """ Returns None iff the file should be downloaded """
422
423 video_title = info_dict.get('title', info_dict.get('id', u'video'))
424 if 'title' in info_dict:
425 # The title can be missing when we're just evaluating the playlist
426 title = info_dict['title']
427 matchtitle = self.params.get('matchtitle', False)
428 if matchtitle:
429 if not re.search(matchtitle, title, re.IGNORECASE):
430 return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
431 rejecttitle = self.params.get('rejecttitle', False)
432 if rejecttitle:
433 if re.search(rejecttitle, title, re.IGNORECASE):
434 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
435 date = info_dict.get('upload_date', None)
436 if date is not None:
437 dateRange = self.params.get('daterange', DateRange())
438 if date not in dateRange:
439 return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
440 view_count = info_dict.get('view_count', None)
441 if view_count is not None:
442 min_views = self.params.get('min_views')
443 if min_views is not None and view_count < min_views:
444 return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
445 max_views = self.params.get('max_views')
446 if max_views is not None and view_count > max_views:
447 return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
448 age_limit = self.params.get('age_limit')
449 if age_limit is not None:
450 if age_limit < info_dict.get('age_limit', 0):
451 return u'Skipping "' + video_title + '" because it is age restricted'
452 if self.in_download_archive(info_dict):
453 return u'%s has already been recorded in archive' % video_title
454 return None
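# For example (values are made up): with params = {'matchtitle': 'cats'}, an
# entry {'id': u'x1', 'title': u'dog video'} returns the string
# u'"dog video" title did not match pattern "cats"' and is skipped, while
# {'id': u'x2', 'title': u'cats compilation'} falls through the remaining
# checks and returns None, so it gets downloaded.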
455
456 @staticmethod
457 def add_extra_info(info_dict, extra_info):
458 '''Set the keys from extra_info in info dict if they are missing'''
459 for key, value in extra_info.items():
460 info_dict.setdefault(key, value)
461
462 def extract_info(self, url, download=True, ie_key=None, extra_info={},
463 process=True):
464 '''
465 Returns a list with a dictionary for each video we find.
466 If 'download', also downloads the videos.
467 extra_info is a dict containing the extra values to add to each result
468 '''
469
470 if ie_key:
471 ies = [self.get_info_extractor(ie_key)]
472 else:
473 ies = self._ies
474
475 for ie in ies:
476 if not ie.suitable(url):
477 continue
478
479 if not ie.working():
480 self.report_warning(u'The program functionality for this site has been marked as broken, '
481 u'and will probably not work.')
482
483 try:
484 ie_result = ie.extract(url)
485 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
486 break
487 if isinstance(ie_result, list):
488 # Backwards compatibility: old IE result format
489 ie_result = {
490 '_type': 'compat_list',
491 'entries': ie_result,
492 }
493 self.add_extra_info(ie_result,
494 {
495 'extractor': ie.IE_NAME,
496 'webpage_url': url,
497 'webpage_url_basename': url_basename(url),
498 'extractor_key': ie.ie_key(),
499 })
500 if process:
501 return self.process_ie_result(ie_result, download, extra_info)
502 else:
503 return ie_result
504 except ExtractorError as de: # An error we somewhat expected
505 self.report_error(compat_str(de), de.format_traceback())
506 break
507 except Exception as e:
508 if self.params.get('ignoreerrors', False):
509 self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
510 break
511 else:
512 raise
513 else:
514 self.report_error(u'no suitable InfoExtractor: %s' % url)
515
516 def process_ie_result(self, ie_result, download=True, extra_info={}):
517 """
518 Take the result of the ie (may be modified) and resolve all unresolved
519 references (URLs, playlist items).
520
521 It will also download the videos if 'download'.
522 Returns the resolved ie_result.
523 """
524
525 result_type = ie_result.get('_type', 'video') # If not given, we assume it's a video (the old default system)
526 if result_type == 'video':
527 self.add_extra_info(ie_result, extra_info)
528 return self.process_video_result(ie_result, download=download)
529 elif result_type == 'url':
530 # We have to add extra_info to the results because it may be
531 # contained in a playlist
532 return self.extract_info(ie_result['url'],
533 download,
534 ie_key=ie_result.get('ie_key'),
535 extra_info=extra_info)
536 elif result_type == 'url_transparent':
537 # Use the information from the embedding page
538 info = self.extract_info(
539 ie_result['url'], ie_key=ie_result.get('ie_key'),
540 extra_info=extra_info, download=False, process=False)
541
542 def make_result(embedded_info):
543 new_result = ie_result.copy()
544 for f in ('_type', 'url', 'ext', 'player_url', 'formats',
545 'entries', 'urlhandle', 'ie_key', 'duration',
546 'subtitles', 'annotations', 'format',
547 'thumbnail', 'thumbnails'):
548 if f in new_result:
549 del new_result[f]
550 if f in embedded_info:
551 new_result[f] = embedded_info[f]
552 return new_result
553 new_result = make_result(info)
554
555 assert new_result.get('_type') != 'url_transparent'
556 if new_result.get('_type') == 'compat_list':
557 new_result['entries'] = [
558 make_result(e) for e in new_result['entries']]
559
560 return self.process_ie_result(
561 new_result, download=download, extra_info=extra_info)
562 elif result_type == 'playlist':
563 # We process each entry in the playlist
564 playlist = ie_result.get('title', None) or ie_result.get('id', None)
565 self.to_screen(u'[download] Downloading playlist: %s' % playlist)
566
567 playlist_results = []
568
569 n_all_entries = len(ie_result['entries'])
570 playliststart = self.params.get('playliststart', 1) - 1
571 playlistend = self.params.get('playlistend', None)
572 # For backwards compatibility, interpret -1 as whole list
573 if playlistend == -1:
574 playlistend = None
575
576 entries = ie_result['entries'][playliststart:playlistend]
577 n_entries = len(entries)
578
579 self.to_screen(
580 u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
581 (ie_result['extractor'], playlist, n_all_entries, n_entries))
582
583 for i, entry in enumerate(entries, 1):
584 self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
585 extra = {
586 'playlist': playlist,
587 'playlist_index': i + playliststart,
588 'extractor': ie_result['extractor'],
589 'webpage_url': ie_result['webpage_url'],
590 'webpage_url_basename': url_basename(ie_result['webpage_url']),
591 'extractor_key': ie_result['extractor_key'],
592 }
593
594 reason = self._match_entry(entry)
595 if reason is not None:
596 self.to_screen(u'[download] ' + reason)
597 continue
598
599 entry_result = self.process_ie_result(entry,
600 download=download,
601 extra_info=extra)
602 playlist_results.append(entry_result)
603 ie_result['entries'] = playlist_results
604 return ie_result
605 elif result_type == 'compat_list':
606 def _fixup(r):
607 self.add_extra_info(r,
608 {
609 'extractor': ie_result['extractor'],
610 'webpage_url': ie_result['webpage_url'],
611 'webpage_url_basename': url_basename(ie_result['webpage_url']),
612 'extractor_key': ie_result['extractor_key'],
613 })
614 return r
615 ie_result['entries'] = [
616 self.process_ie_result(_fixup(r), download, extra_info)
617 for r in ie_result['entries']
618 ]
619 return ie_result
620 else:
621 raise Exception('Invalid result type: %s' % result_type)
622
623 def select_format(self, format_spec, available_formats):
624 if format_spec == 'best' or format_spec is None:
625 return available_formats[-1]
626 elif format_spec == 'worst':
627 return available_formats[0]
628 else:
629 extensions = [u'mp4', u'flv', u'webm', u'3gp']
630 if format_spec in extensions:
631 filter_f = lambda f: f['ext'] == format_spec
632 else:
633 filter_f = lambda f: f['format_id'] == format_spec
634 matches = list(filter(filter_f, available_formats))
635 if matches:
636 return matches[-1]
637 return None
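# The rest of this file assumes available_formats is ordered worst to best.
# For a hypothetical list like:
#
#     available_formats = [
#         {'format_id': u'17', 'ext': u'3gp', 'url': u'...'},
#         {'format_id': u'18', 'ext': u'mp4', 'url': u'...'},
#         {'format_id': u'22', 'ext': u'mp4', 'url': u'...'},
#     ]
#
# select_format(u'best', ...) picks the '22' entry (the last element),
# select_format(u'worst', ...) picks '17', select_format(u'mp4', ...) picks
# the last mp4 ('22'), and select_format(u'18', ...) matches on format_id.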
638
639 def process_video_result(self, info_dict, download=True):
640 assert info_dict.get('_type', 'video') == 'video'
641
642 if 'playlist' not in info_dict:
643 # It isn't part of a playlist
644 info_dict['playlist'] = None
645 info_dict['playlist_index'] = None
646
647 # These extractors handle format selection themselves
648 if info_dict['extractor'] in [u'Youku']:
649 if download:
650 self.process_info(info_dict)
651 return info_dict
652
653 # We now pick which formats have to be downloaded
654 if info_dict.get('formats') is None:
655 # There's only one format available
656 formats = [info_dict]
657 else:
658 formats = info_dict['formats']
659
660 # We check that all the formats have the format and format_id fields
661 for (i, format) in enumerate(formats):
662 if format.get('format_id') is None:
663 format['format_id'] = compat_str(i)
664 if format.get('format') is None:
665 format['format'] = u'{id} - {res}{note}'.format(
666 id=format['format_id'],
667 res=self.format_resolution(format),
668 note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
669 )
670 # Automatically determine file extension if missing
671 if 'ext' not in format:
672 format['ext'] = determine_ext(format['url'])
673
674 format_limit = self.params.get('format_limit', None)
675 if format_limit:
676 formats = list(takewhile_inclusive(
677 lambda f: f['format_id'] != format_limit, formats
678 ))
679 if self.params.get('prefer_free_formats'):
680 def _free_formats_key(f):
681 try:
682 ext_ord = [u'flv', u'mp4', u'webm'].index(f['ext'])
683 except ValueError:
684 ext_ord = -1
685 # We only compare the extension if they have the same height and width
686 return (f.get('height') if f.get('height') is not None else -1,
687 f.get('width') if f.get('width') is not None else -1,
688 ext_ord)
689 formats = sorted(formats, key=_free_formats_key)
690
691 if formats[0] is not info_dict:
692 # Only set the 'formats' field if the original info_dict listed formats;
693 # otherwise we end up with a circular reference: the first (and only)
694 # element in the 'formats' field of info_dict is info_dict itself,
695 # which can't be exported to json
696 info_dict['formats'] = formats
697 if self.params.get('listformats', None):
698 self.list_formats(info_dict)
699 return
700
701 req_format = self.params.get('format', 'best')
702 if req_format is None:
703 req_format = 'best'
704 formats_to_download = []
705 # The -1 is for supporting YoutubeIE
706 if req_format in ('-1', 'all'):
707 formats_to_download = formats
708 else:
709 # We can accept formats requested in the form 34/5/best; we pick
710 # the first that is available, starting from the left
711 req_formats = req_format.split('/')
712 for rf in req_formats:
713 selected_format = self.select_format(rf, formats)
714 if selected_format is not None:
715 formats_to_download = [selected_format]
716 break
717 if not formats_to_download:
718 raise ExtractorError(u'requested format not available',
719 expected=True)
720
721 if download:
722 if len(formats_to_download) > 1:
723 self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
724 for format in formats_to_download:
725 new_info = dict(info_dict)
726 new_info.update(format)
727 self.process_info(new_info)
728 # We update the info dict with the best quality format (backwards compatibility)
729 info_dict.update(formats_to_download[-1])
730 return info_dict
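# For instance (hypothetical format ids): with self.params['format'] set to
# u'137/22/best', the loop above first tries select_format(u'137', ...); if
# no format with that format_id exists it falls back to u'22', and finally
# to u'best' (the last, highest-quality entry). With 'all' or '-1', every
# available format is downloaded, each through its own process_info() call.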
731
732 def process_info(self, info_dict):
733 """Process a single resolved IE result."""
734
735 assert info_dict.get('_type', 'video') == 'video'
736 # We increment the download count here to match the previous behaviour.
737 self.increment_downloads()
738
739 info_dict['fulltitle'] = info_dict['title']
740 if len(info_dict['title']) > 200:
741 info_dict['title'] = info_dict['title'][:197] + u'...'
742
743 # Keep for backwards compatibility
744 info_dict['stitle'] = info_dict['title']
745
746 if 'format' not in info_dict:
747 info_dict['format'] = info_dict['ext']
748
749 reason = self._match_entry(info_dict)
750 if reason is not None:
751 self.to_screen(u'[download] ' + reason)
752 return
753
754 max_downloads = self.params.get('max_downloads')
755 if max_downloads is not None:
756 if self._num_downloads > int(max_downloads):
757 raise MaxDownloadsReached()
758
759 filename = self.prepare_filename(info_dict)
760
761 # Forced printings
762 if self.params.get('forcetitle', False):
763 self.to_stdout(info_dict['fulltitle'])
764 if self.params.get('forceid', False):
765 self.to_stdout(info_dict['id'])
766 if self.params.get('forceurl', False):
767 # For RTMP URLs, also include the playpath
768 self.to_stdout(info_dict['url'] + info_dict.get('play_path', u''))
769 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
770 self.to_stdout(info_dict['thumbnail'])
771 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
772 self.to_stdout(info_dict['description'])
773 if self.params.get('forcefilename', False) and filename is not None:
774 self.to_stdout(filename)
775 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
776 self.to_stdout(formatSeconds(info_dict['duration']))
777 if self.params.get('forceformat', False):
778 self.to_stdout(info_dict['format'])
779 if self.params.get('forcejson', False):
780 info_dict['_filename'] = filename
781 self.to_stdout(json.dumps(info_dict))
782
783 # Do nothing else if in simulate mode
784 if self.params.get('simulate', False):
785 return
786
787 if filename is None:
788 return
789
790 try:
791 dn = os.path.dirname(encodeFilename(filename))
792 if dn != '' and not os.path.exists(dn):
793 os.makedirs(dn)
794 except (OSError, IOError) as err:
795 self.report_error(u'unable to create directory ' + compat_str(err))
796 return
797
798 if self.params.get('writedescription', False):
799 descfn = filename + u'.description'
800 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
801 self.to_screen(u'[info] Video description is already present')
802 else:
803 try:
804 self.to_screen(u'[info] Writing video description to: ' + descfn)
805 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
806 descfile.write(info_dict['description'])
807 except (KeyError, TypeError):
808 self.report_warning(u'There\'s no description to write.')
809 except (OSError, IOError):
810 self.report_error(u'Cannot write description file ' + descfn)
811 return
812
813 if self.params.get('writeannotations', False):
814 annofn = filename + u'.annotations.xml'
815 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
816 self.to_screen(u'[info] Video annotations are already present')
817 else:
818 try:
819 self.to_screen(u'[info] Writing video annotations to: ' + annofn)
820 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
821 annofile.write(info_dict['annotations'])
822 except (KeyError, TypeError):
823 self.report_warning(u'There are no annotations to write.')
824 except (OSError, IOError):
825 self.report_error(u'Cannot write annotations file: ' + annofn)
826 return
827
828 subtitles_are_requested = any([self.params.get('writesubtitles', False),
829 self.params.get('writeautomaticsub')])
830
831 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
832 # subtitle download errors are already managed as troubles in the relevant IE,
833 # so this silently goes on when used with an IE that does not support subtitles
834 subtitles = info_dict['subtitles']
835 sub_format = self.params.get('subtitlesformat', 'srt')
836 for sub_lang in subtitles.keys():
837 sub = subtitles[sub_lang]
838 if sub is None:
839 continue
840 try:
841 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
842 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
843 self.to_screen(u'[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
844 else:
845 self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
846 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
847 subfile.write(sub)
848 except (OSError, IOError):
849 self.report_error(u'Cannot write subtitles file ' + sub_filename)
850 return
851
852 if self.params.get('writeinfojson', False):
853 infofn = os.path.splitext(filename)[0] + u'.info.json'
854 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
855 self.to_screen(u'[info] Video description metadata is already present')
856 else:
857 self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
858 try:
859 json_info_dict = dict((k, v) for k, v in info_dict.items() if k not in ['urlhandle'])
860 write_json_file(json_info_dict, encodeFilename(infofn))
861 except (OSError, IOError):
862 self.report_error(u'Cannot write metadata to JSON file ' + infofn)
863 return
864
865 if self.params.get('writethumbnail', False):
866 if info_dict.get('thumbnail') is not None:
867 thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
868 thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
869 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
870 self.to_screen(u'[%s] %s: Thumbnail is already present' %
871 (info_dict['extractor'], info_dict['id']))
872 else:
873 self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
874 (info_dict['extractor'], info_dict['id']))
875 try:
876 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
877 with open(thumb_filename, 'wb') as thumbf:
878 shutil.copyfileobj(uf, thumbf)
879 self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
880 (info_dict['extractor'], info_dict['id'], thumb_filename))
881 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
882 self.report_warning(u'Unable to download thumbnail "%s": %s' %
883 (info_dict['thumbnail'], compat_str(err)))
884
885 if not self.params.get('skip_download', False):
886 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
887 success = True
888 else:
889 try:
890 fd = get_suitable_downloader(info_dict)(self, self.params)
891 for ph in self._fd_progress_hooks:
892 fd.add_progress_hook(ph)
893 success = fd.download(filename, info_dict)
894 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
895 self.report_error(u'unable to download video data: %s' % str(err))
896 return
897 except (OSError, IOError) as err:
898 raise UnavailableVideoError(err)
899 except (ContentTooShortError, ) as err:
900 self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
901 return
902
903 if success:
904 try:
905 self.post_process(filename, info_dict)
906 except (PostProcessingError) as err:
907 self.report_error(u'postprocessing: %s' % str(err))
908 return
909
910 self.record_download_archive(info_dict)
911
912 def download(self, url_list):
913 """Download a given list of URLs."""
914 if (len(url_list) > 1 and
915 '%' not in self.params['outtmpl']
916 and self.params.get('max_downloads') != 1):
917 raise SameFileError(self.params['outtmpl'])
918
919 for url in url_list:
920 try:
921 # It also downloads the videos
922 self.extract_info(url)
923 except UnavailableVideoError:
924 self.report_error(u'unable to download video')
925 except MaxDownloadsReached:
926 self.to_screen(u'[info] Maximum number of downloaded files reached.')
927 raise
928
929 return self._download_retcode
930
931 def download_with_info_file(self, info_filename):
932 with io.open(info_filename, 'r', encoding='utf-8') as f:
933 info = json.load(f)
934 try:
935 self.process_ie_result(info, download=True)
936 except DownloadError:
937 webpage_url = info.get('webpage_url')
938 if webpage_url is not None:
939 self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url)
940 return self.download([webpage_url])
941 else:
942 raise
943 return self._download_retcode
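# A sketch of the round trip this enables (file names are made up): a run
# with the 'writeinfojson' option produces e.g. 'Some video-abc123.info.json',
# and that file can later be fed back in without re-extracting the page:
#
#     ydl = YoutubeDL({'outtmpl': u'%(title)s-%(id)s.%(ext)s', 'writeinfojson': True})
#     ydl.add_default_info_extractors()
#     ydl.download_with_info_file(u'Some video-abc123.info.json')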
944
945 def post_process(self, filename, ie_info):
946 """Run all the postprocessors on the given file."""
947 info = dict(ie_info)
948 info['filepath'] = filename
949 keep_video = None
950 for pp in self._pps:
951 try:
952 keep_video_wish, new_info = pp.run(info)
953 if keep_video_wish is not None:
954 if keep_video_wish:
955 keep_video = keep_video_wish
956 elif keep_video is None:
957 # No clear decision yet, let IE decide
958 keep_video = keep_video_wish
959 except PostProcessingError as e:
960 self.report_error(e.msg)
961 if keep_video is False and not self.params.get('keepvideo', False):
962 try:
963 self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
964 os.remove(encodeFilename(filename))
965 except (IOError, OSError):
966 self.report_warning(u'Unable to remove downloaded video file')
967
968 def _make_archive_id(self, info_dict):
969 # Future-proof against any change in case
970 # and backwards compatibility with prior versions
971 extractor = info_dict.get('extractor_key')
972 if extractor is None:
973 if 'id' in info_dict:
974 extractor = info_dict.get('ie_key') # key in a playlist
975 if extractor is None:
976 return None # Incomplete video information
977 return extractor.lower() + u' ' + info_dict['id']
978
979 def in_download_archive(self, info_dict):
980 fn = self.params.get('download_archive')
981 if fn is None:
982 return False
983
984 vid_id = self._make_archive_id(info_dict)
985 if vid_id is None:
986 return False # Incomplete video information
987
988 try:
989 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
990 for line in archive_file:
991 if line.strip() == vid_id:
992 return True
993 except IOError as ioe:
994 if ioe.errno != errno.ENOENT:
995 raise
996 return False
997
998 def record_download_archive(self, info_dict):
999 fn = self.params.get('download_archive')
1000 if fn is None:
1001 return
1002 vid_id = self._make_archive_id(info_dict)
1003 assert vid_id
1004 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1005 archive_file.write(vid_id + u'\n')
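# The archive file is plain text, one entry per line, in the form
# '<extractor key, lowercased> <video id>', for example (made-up ids):
#
#     youtube dQw4w9WgXcQ
#     vimeo 12345678
#
# in_download_archive() matches lines exactly, so deleting a line by hand
# makes the corresponding video downloadable again.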
1006
1007 @staticmethod
1008 def format_resolution(format, default='unknown'):
1009 if format.get('vcodec') == 'none':
1010 return 'audio only'
1011 if format.get('_resolution') is not None:
1012 return format['_resolution']
1013 if format.get('height') is not None:
1014 if format.get('width') is not None:
1015 res = u'%sx%s' % (format['width'], format['height'])
1016 else:
1017 res = u'%sp' % format['height']
1018 else:
1019 res = default
1020 return res
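# Examples of the strings this produces (inputs are illustrative):
#
#     format_resolution({'width': 1280, 'height': 720})  ->  u'1280x720'
#     format_resolution({'height': 480})                  ->  u'480p'
#     format_resolution({'vcodec': 'none', 'abr': 128})   ->  'audio only'
#     format_resolution({})                               ->  'unknown'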
1021
1022 def list_formats(self, info_dict):
1023 def format_note(fdict):
1024 res = u''
1025 if fdict.get('format_note') is not None:
1026 res += fdict['format_note'] + u' '
1027 if (fdict.get('vcodec') is not None and
1028 fdict.get('vcodec') != 'none'):
1029 res += u'%-5s' % fdict['vcodec']
1030 elif fdict.get('vbr') is not None:
1031 res += u'video'
1032 if fdict.get('vbr') is not None:
1033 res += u'@%4dk' % fdict['vbr']
1034 if fdict.get('acodec') is not None:
1035 if res:
1036 res += u', '
1037 res += u'%-5s' % fdict['acodec']
1038 elif fdict.get('abr') is not None:
1039 if res:
1040 res += u', '
1041 res += 'audio'
1042 if fdict.get('abr') is not None:
1043 res += u'@%3dk' % fdict['abr']
1044 if fdict.get('filesize') is not None:
1045 if res:
1046 res += u', '
1047 res += format_bytes(fdict['filesize'])
1048 return res
1049
1050 def line(format, idlen=20):
1051 return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
1052 format['format_id'],
1053 format['ext'],
1054 self.format_resolution(format),
1055 format_note(format),
1056 ))
1057
1058 formats = info_dict.get('formats', [info_dict])
1059 idlen = max(len(u'format code'),
1060 max(len(f['format_id']) for f in formats))
1061 formats_s = [line(f, idlen) for f in formats]
1062 if len(formats) > 1:
1063 formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
1064 formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
1065
1066 header_line = line({
1067 'format_id': u'format code', 'ext': u'extension',
1068 '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
1069 self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
1070 (info_dict['id'], header_line, u"\n".join(formats_s)))
1071
1072 def urlopen(self, req):
1073 """ Start an HTTP download """
1074 return self._opener.open(req)
1075
1076 def print_debug_header(self):
1077 if not self.params.get('verbose'):
1078 return
1079 write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
1080 try:
1081 sp = subprocess.Popen(
1082 ['git', 'rev-parse', '--short', 'HEAD'],
1083 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1084 cwd=os.path.dirname(os.path.abspath(__file__)))
1085 out, err = sp.communicate()
1086 out = out.decode().strip()
1087 if re.match('[0-9a-f]+', out):
1088 write_string(u'[debug] Git HEAD: ' + out + u'\n')
1089 except:
1090 try:
1091 sys.exc_clear()
1092 except:
1093 pass
1094 write_string(u'[debug] Python version %s - %s' %
1095 (platform.python_version(), platform_name()) + u'\n')
1096
1097 proxy_map = {}
1098 for handler in self._opener.handlers:
1099 if hasattr(handler, 'proxies'):
1100 proxy_map.update(handler.proxies)
1101 write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
1102
1103 def _setup_opener(self):
1104 timeout_val = self.params.get('socket_timeout')
1105 timeout = 600 if timeout_val is None else float(timeout_val)
1106
1107 opts_cookiefile = self.params.get('cookiefile')
1108 opts_proxy = self.params.get('proxy')
1109
1110 if opts_cookiefile is None:
1111 self.cookiejar = compat_cookiejar.CookieJar()
1112 else:
1113 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1114 opts_cookiefile)
1115 if os.access(opts_cookiefile, os.R_OK):
1116 self.cookiejar.load()
1117
1118 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1119 self.cookiejar)
1120 if opts_proxy is not None:
1121 if opts_proxy == '':
1122 proxies = {}
1123 else:
1124 proxies = {'http': opts_proxy, 'https': opts_proxy}
1125 else:
1126 proxies = compat_urllib_request.getproxies()
1127 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1128 if 'http' in proxies and 'https' not in proxies:
1129 proxies['https'] = proxies['http']
1130 proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1131 https_handler = make_HTTPS_handler(
1132 self.params.get('nocheckcertificate', False))
1133 opener = compat_urllib_request.build_opener(
1134 https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
1135 # Delete the default user-agent header, which would otherwise apply in
1136 # cases where our custom HTTP handler doesn't come into play
1137 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1138 opener.addheaders = []
1139 self._opener = opener
1140
1141 # TODO remove this global modification
1142 compat_urllib_request.install_opener(opener)
1143 socket.setdefaulttimeout(timeout)
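# A sketch of the options that feed _setup_opener() above (values are
# illustrative): passing a proxy and a cookie file when constructing the
# object routes every urlopen() call through them.
#
#     ydl = YoutubeDL({
#         'outtmpl': u'%(id)s.%(ext)s',
#         'proxy': u'http://127.0.0.1:8080',
#         'cookiefile': u'cookies.txt',   # Mozilla/Netscape cookie file format
#         'socket_timeout': 20,
#         'nocheckcertificate': False,
#     })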