]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
Merge remote-tracking branch 'jaimeMF/merge-formats'
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import errno
8 import io
9 import json
10 import os
11 import platform
12 import re
13 import shutil
14 import subprocess
15 import socket
16 import sys
17 import time
18 import traceback
19
20 if os.name == 'nt':
21 import ctypes
22
23 from .utils import (
24 compat_cookiejar,
25 compat_http_client,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_request,
29 ContentTooShortError,
30 date_from_str,
31 DateRange,
32 determine_ext,
33 DownloadError,
34 encodeFilename,
35 ExtractorError,
36 format_bytes,
37 formatSeconds,
38 get_term_width,
39 locked_file,
40 make_HTTPS_handler,
41 MaxDownloadsReached,
42 PostProcessingError,
43 platform_name,
44 preferredencoding,
45 SameFileError,
46 sanitize_filename,
47 subtitles_filename,
48 takewhile_inclusive,
49 UnavailableVideoError,
50 url_basename,
51 write_json_file,
52 write_string,
53 YoutubeDLHandler,
54 prepend_extension,
55 )
56 from .extractor import get_info_extractor, gen_extractors
57 from .downloader import get_suitable_downloader
58 from .PostProcessor import FFmpegMergerPP
59 from .version import __version__
60
61
62 class YoutubeDL(object):
63 """YoutubeDL class.
64
65 YoutubeDL objects are the ones responsible of downloading the
66 actual video file and writing it to disk if the user has requested
67 it, among some other tasks. In most cases there should be one per
68 program. As, given a video URL, the downloader doesn't know how to
69 extract all the needed information, task that InfoExtractors do, it
70 has to pass the URL to one of them.
71
72 For this, YoutubeDL objects have a method that allows
73 InfoExtractors to be registered in a given order. When it is passed
74 a URL, the YoutubeDL object handles it to the first InfoExtractor it
75 finds that reports being able to handle it. The InfoExtractor extracts
76 all the information about the video or videos the URL refers to, and
77 YoutubeDL process the extracted information, possibly using a File
78 Downloader to download the video.
79
80 YoutubeDL objects accept a lot of parameters. In order not to saturate
81 the object constructor with arguments, it receives a dictionary of
82 options instead. These options are available through the params
83 attribute for the InfoExtractors to use. The YoutubeDL also
84 registers itself as the downloader in charge for the InfoExtractors
85 that are added to it, so this is a "mutual registration".
86
87 Available options:
88
89 username: Username for authentication purposes.
90 password: Password for authentication purposes.
    videopassword:     Password for accessing a video.
92 usenetrc: Use netrc for authentication instead.
93 verbose: Print additional info to stdout.
94 quiet: Do not print messages to stdout.
95 forceurl: Force printing final URL.
96 forcetitle: Force printing title.
97 forceid: Force printing ID.
98 forcethumbnail: Force printing thumbnail URL.
99 forcedescription: Force printing description.
100 forcefilename: Force printing final filename.
101 forceduration: Force printing duration.
102 forcejson: Force printing info_dict as JSON.
103 simulate: Do not download the video files.
104 format: Video format code.
105 format_limit: Highest quality format to try.
106 outtmpl: Template for output names.
107 restrictfilenames: Do not allow "&" and spaces in file names
108 ignoreerrors: Do not stop on download errors.
109 nooverwrites: Prevent overwriting files.
110 playliststart: Playlist item to start at.
111 playlistend: Playlist item to end at.
112 matchtitle: Download only matching titles.
113 rejecttitle: Reject downloads for matching titles.
114 logger: Log messages to a logging.Logger instance.
115 logtostderr: Log messages to stderr instead of stdout.
116 writedescription: Write the video description to a .description file
117 writeinfojson: Write the video description to a .info.json file
118 writeannotations: Write the video annotations to a .annotations.xml file
119 writethumbnail: Write the thumbnail image to a file
120 writesubtitles: Write the video subtitles to a file
121 writeautomaticsub: Write the automatic subtitles to a file
122 allsubtitles: Downloads all the subtitles of the video
123 (requires writesubtitles or writeautomaticsub)
124 listsubtitles: Lists all available subtitles for the video
125 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
126 subtitleslangs: List of languages of the subtitles to download
127 keepvideo: Keep the video file after post-processing
128 daterange: A DateRange object, download only if the upload_date is in the range.
129 skip_download: Skip the actual download of the video file
130 cachedir: Location of the cache files in the filesystem.
131 None to disable filesystem cache.
132 noplaylist: Download single video instead of a playlist if in doubt.
133 age_limit: An integer representing the user's age in years.
134 Unsuitable videos for the given age are skipped.
135 min_views: An integer representing the minimum view count the video
136 must have in order to not be skipped.
137 Videos without view count information are always
138 downloaded. None for no limit.
139 max_views: An integer representing the maximum view count.
140 Videos that are more popular than that are not
141 downloaded.
142 Videos without view count information are always
143 downloaded. None for no limit.
144 download_archive: File name of a file where all downloads are recorded.
145 Videos already present in the file are not downloaded
146 again.
147 cookiefile: File name where cookies should be read from and dumped to.
148 nocheckcertificate:Do not verify SSL certificates
149 proxy: URL of the proxy server to use
150 socket_timeout: Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
153 debug_printtraffic:Print out sent and received HTTP traffic
154
155 The following parameters are not used by YoutubeDL itself, they are used by
156 the FileDownloader:
157 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
158 noresizebuffer, retries, continuedl, noprogress, consoletitle
159 """
160
161 params = None
162 _ies = []
163 _pps = []
164 _download_retcode = None
165 _num_downloads = None
166 _screen_file = None
167
168 def __init__(self, params=None):
169 """Create a FileDownloader object with the given options."""
170 if params is None:
171 params = {}
172 self._ies = []
173 self._ies_instances = {}
174 self._pps = []
175 self._progress_hooks = []
176 self._download_retcode = 0
177 self._num_downloads = 0
178 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
179 self._err_file = sys.stderr
180 self.params = params
181
182 if params.get('bidi_workaround', False):
183 try:
184 import pty
185 master, slave = pty.openpty()
186 width = get_term_width()
187 if width is None:
188 width_args = []
189 else:
190 width_args = ['-w', str(width)]
191 sp_kwargs = dict(
192 stdin=subprocess.PIPE,
193 stdout=slave,
194 stderr=self._err_file)
195 try:
196 self._output_process = subprocess.Popen(
197 ['bidiv'] + width_args, **sp_kwargs
198 )
199 except OSError:
200 self._output_process = subprocess.Popen(
201 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
202 self._output_channel = os.fdopen(master, 'rb')
203 except OSError as ose:
204 if ose.errno == 2:
205 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
206 else:
207 raise
208
209 if (sys.version_info >= (3,) and sys.platform != 'win32' and
210 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
211 and not params['restrictfilenames']):
212 # On Python 3, the Unicode filesystem API will throw errors (#1474)
213 self.report_warning(
214 'Assuming --restrict-filenames since file system encoding '
215 'cannot encode all charactes. '
216 'Set the LC_ALL environment variable to fix this.')
217 self.params['restrictfilenames'] = True
218
219 if '%(stitle)s' in self.params.get('outtmpl', ''):
220 self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
221
222 self._setup_opener()
223
224 def add_info_extractor(self, ie):
225 """Add an InfoExtractor object to the end of the list."""
226 self._ies.append(ie)
227 self._ies_instances[ie.ie_key()] = ie
228 ie.set_downloader(self)
229
230 def get_info_extractor(self, ie_key):
231 """
232 Get an instance of an IE with name ie_key, it will try to get one from
233 the _ies list, if there's no instance it will create a new one and add
234 it to the extractor list.
235 """
236 ie = self._ies_instances.get(ie_key)
237 if ie is None:
238 ie = get_info_extractor(ie_key)()
239 self.add_info_extractor(ie)
240 return ie
241
242 def add_default_info_extractors(self):
243 """
244 Add the InfoExtractors returned by gen_extractors to the end of the list
245 """
246 for ie in gen_extractors():
247 self.add_info_extractor(ie)
248
249 def add_post_processor(self, pp):
250 """Add a PostProcessor object to the end of the chain."""
251 self._pps.append(pp)
252 pp.set_downloader(self)
253
254 def add_progress_hook(self, ph):
255 """Add the progress hook (currently only for the file downloader)"""
256 self._progress_hooks.append(ph)
257
258 def _bidi_workaround(self, message):
259 if not hasattr(self, '_output_channel'):
260 return message
261
262 assert hasattr(self, '_output_process')
263 assert type(message) == type('')
264 line_count = message.count('\n') + 1
265 self._output_process.stdin.write((message + '\n').encode('utf-8'))
266 self._output_process.stdin.flush()
267 res = ''.join(self._output_channel.readline().decode('utf-8')
268 for _ in range(line_count))
269 return res[:-len('\n')]
270
271 def to_screen(self, message, skip_eol=False):
272 """Print message to stdout if not in quiet mode."""
273 return self.to_stdout(message, skip_eol, check_quiet=True)
274
275 def to_stdout(self, message, skip_eol=False, check_quiet=False):
276 """Print message to stdout if not in quiet mode."""
277 if self.params.get('logger'):
278 self.params['logger'].debug(message)
279 elif not check_quiet or not self.params.get('quiet', False):
280 message = self._bidi_workaround(message)
281 terminator = ['\n', ''][skip_eol]
282 output = message + terminator
283
284 write_string(output, self._screen_file)
285
286 def to_stderr(self, message):
287 """Print message to stderr."""
288 assert type(message) == type('')
289 if self.params.get('logger'):
290 self.params['logger'].error(message)
291 else:
292 message = self._bidi_workaround(message)
293 output = message + '\n'
294 write_string(output, self._err_file)
295
296 def to_console_title(self, message):
297 if not self.params.get('consoletitle', False):
298 return
299 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
300 # c_wchar_p() might not be necessary if `message` is
301 # already of type unicode()
302 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
303 elif 'TERM' in os.environ:
304 write_string('\033]0;%s\007' % message, self._screen_file)
305
306 def save_console_title(self):
307 if not self.params.get('consoletitle', False):
308 return
309 if 'TERM' in os.environ:
310 # Save the title on stack
311 write_string('\033[22;0t', self._screen_file)
312
313 def restore_console_title(self):
314 if not self.params.get('consoletitle', False):
315 return
316 if 'TERM' in os.environ:
317 # Restore the title from stack
318 write_string('\033[23;0t', self._screen_file)
319
320 def __enter__(self):
321 self.save_console_title()
322 return self
323
324 def __exit__(self, *args):
325 self.restore_console_title()
326
327 if self.params.get('cookiefile') is not None:
328 self.cookiejar.save()
329
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # ExtractorError carries the original cause in .exc_info;
                    # include that traceback before the current one.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show where we were called from.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info when available so the
            # raised DownloadError points at the root cause.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # ignoreerrors: record the failure in the process return code instead.
        self._download_retcode = 1
359
360 def report_warning(self, message):
361 '''
362 Print the message to stderr, it will be prefixed with 'WARNING:'
363 If stderr is a tty file the 'WARNING:' will be colored
364 '''
365 if self._err_file.isatty() and os.name != 'nt':
366 _msg_header = '\033[0;33mWARNING:\033[0m'
367 else:
368 _msg_header = 'WARNING:'
369 warning_message = '%s %s' % (_msg_header, message)
370 self.to_stderr(warning_message)
371
372 def report_error(self, message, tb=None):
373 '''
374 Do the same as trouble, but prefixes the message with 'ERROR:', colored
375 in red if stderr is a tty file.
376 '''
377 if self._err_file.isatty() and os.name != 'nt':
378 _msg_header = '\033[0;31mERROR:\033[0m'
379 else:
380 _msg_header = 'ERROR:'
381 error_message = '%s %s' % (_msg_header, message)
382 self.trouble(error_message, tb)
383
384 def report_file_already_downloaded(self, file_name):
385 """Report file has already been fully downloaded."""
386 try:
387 self.to_screen('[download] %s has already been downloaded' % file_name)
388 except UnicodeEncodeError:
389 self.to_screen('[download] The file has already been downloaded')
390
391 def increment_downloads(self):
392 """Increment the ordinal that assigns a number to each file."""
393 self._num_downloads += 1
394
395 def prepare_filename(self, info_dict):
396 """Generate the output filename."""
397 try:
398 template_dict = dict(info_dict)
399
400 template_dict['epoch'] = int(time.time())
401 autonumber_size = self.params.get('autonumber_size')
402 if autonumber_size is None:
403 autonumber_size = 5
404 autonumber_templ = '%0' + str(autonumber_size) + 'd'
405 template_dict['autonumber'] = autonumber_templ % self._num_downloads
406 if template_dict.get('playlist_index') is not None:
407 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
408
409 sanitize = lambda k, v: sanitize_filename(
410 compat_str(v),
411 restricted=self.params.get('restrictfilenames'),
412 is_id=(k == 'id'))
413 template_dict = dict((k, sanitize(k, v))
414 for k, v in template_dict.items()
415 if v is not None)
416 template_dict = collections.defaultdict(lambda: 'NA', template_dict)
417
418 tmpl = os.path.expanduser(self.params['outtmpl'])
419 filename = tmpl % template_dict
420 return filename
421 except ValueError as err:
422 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
423 return None
424
425 def _match_entry(self, info_dict):
426 """ Returns None iff the file should be downloaded """
427
428 video_title = info_dict.get('title', info_dict.get('id', 'video'))
429 if 'title' in info_dict:
430 # This can happen when we're just evaluating the playlist
431 title = info_dict['title']
432 matchtitle = self.params.get('matchtitle', False)
433 if matchtitle:
434 if not re.search(matchtitle, title, re.IGNORECASE):
435 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
436 rejecttitle = self.params.get('rejecttitle', False)
437 if rejecttitle:
438 if re.search(rejecttitle, title, re.IGNORECASE):
439 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
440 date = info_dict.get('upload_date', None)
441 if date is not None:
442 dateRange = self.params.get('daterange', DateRange())
443 if date not in dateRange:
444 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
445 view_count = info_dict.get('view_count', None)
446 if view_count is not None:
447 min_views = self.params.get('min_views')
448 if min_views is not None and view_count < min_views:
449 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
450 max_views = self.params.get('max_views')
451 if max_views is not None and view_count > max_views:
452 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
453 age_limit = self.params.get('age_limit')
454 if age_limit is not None:
455 if age_limit < info_dict.get('age_limit', 0):
456 return 'Skipping "' + title + '" because it is age restricted'
457 if self.in_download_archive(info_dict):
458 return '%s has already been recorded in archive' % video_title
459 return None
460
461 @staticmethod
462 def add_extra_info(info_dict, extra_info):
463 '''Set the keys from extra_info in info dict if they are missing'''
464 for key, value in extra_info.items():
465 info_dict.setdefault(key, value)
466
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result

        ie_key, if given, restricts extraction to that single extractor.
        When process is False the raw ie result is returned without being
        resolved by process_ie_result.
        '''

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            # First extractor whose suitable() accepts the URL wins.
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                # Record which extractor produced the result and where it came from.
                self.add_extra_info(ie_result,
                    {
                        'extractor': ie.IE_NAME,
                        'webpage_url': url,
                        'webpage_url_basename': url_basename(url),
                        'extractor_key': ie.ie_key(),
                    })
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as de: # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: only reached when no extractor accepted the URL.
            self.report_error('no suitable InfoExtractor: %s' % url)
520
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']: 'video' (default), 'url',
        'url_transparent', 'playlist', or the legacy 'compat_list'.
        """

        result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            def make_result(embedded_info):
                # Merge the embedded info into a copy of the original result,
                # dropping the listed fields first so embedded values win.
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'ie_key', 'duration',
                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
                        new_result[f] = embedded_info[f]
                return new_result
            new_result = make_result(info)

            assert new_result.get('_type') != 'url_transparent'
            if new_result.get('_type') == 'compat_list':
                new_result['entries'] = [
                    make_result(e) for e in new_result['entries']]

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            n_all_entries = len(ie_result['entries'])
            # playliststart is 1-based on the command line; make it a 0-based index.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            entries = ie_result['entries'][playliststart:playlistend]
            n_entries = len(entries)

            self.to_screen(
                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
                extra = {
                    'playlist': playlist,
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Honour title/date/view-count filters per playlist entry.
                reason = self._match_entry(entry)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            def _fixup(r):
                # Propagate extractor provenance onto each legacy entry.
                self.add_extra_info(r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    })
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
627
628 def select_format(self, format_spec, available_formats):
629 if format_spec == 'best' or format_spec is None:
630 return available_formats[-1]
631 elif format_spec == 'worst':
632 return available_formats[0]
633 else:
634 extensions = ['mp4', 'flv', 'webm', '3gp']
635 if format_spec in extensions:
636 filter_f = lambda f: f['ext'] == format_spec
637 else:
638 filter_f = lambda f: f['format_id'] == format_spec
639 matches = list(filter(filter_f, available_formats))
640 if matches:
641 return matches[-1]
642 return None
643
    def process_video_result(self, info_dict, download=True):
        """Normalize a single video result, pick the format(s) to fetch,
        and hand each one to process_info when download is true.

        Returns info_dict, updated with the chosen (best) format.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        # We check that all the formats have the format and format_id fields
        for (i, format) in enumerate(formats):
            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url'])

        # Drop everything past the format_limit cutoff (inclusive of the limit).
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict lists them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats', None):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format', 'best')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        # The -1 is for supporting YoutubeIE
        if req_format in ('-1', 'all'):
            formats_to_download = formats
        else:
            # We can accept formats requested in the format: 34/5/best, we pick
            # the first that is available, starting from left
            req_formats = req_format.split('/')
            for rf in req_formats:
                if re.match(r'.+?\+.+?', rf) is not None:
                    # Two formats have been requested like '137+139'
                    format_1, format_2 = rf.split('+')
                    formats_info = (self.select_format(format_1, formats),
                                    self.select_format(format_2, formats))
                    # Only merge when both halves resolved to a format.
                    if all(formats_info):
                        selected_format = {'requested_formats': formats_info}
                    else:
                        selected_format = None
                else:
                    selected_format = self.select_format(rf, formats)
                if selected_format is not None:
                    formats_to_download = [selected_format]
                    break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
737
738 def process_info(self, info_dict):
739 """Process a single resolved IE result."""
740
741 assert info_dict.get('_type', 'video') == 'video'
742 #We increment the download the download count here to match the previous behaviour.
743 self.increment_downloads()
744
745 info_dict['fulltitle'] = info_dict['title']
746 if len(info_dict['title']) > 200:
747 info_dict['title'] = info_dict['title'][:197] + '...'
748
749 # Keep for backwards compatibility
750 info_dict['stitle'] = info_dict['title']
751
752 if not 'format' in info_dict:
753 info_dict['format'] = info_dict['ext']
754
755 reason = self._match_entry(info_dict)
756 if reason is not None:
757 self.to_screen('[download] ' + reason)
758 return
759
760 max_downloads = self.params.get('max_downloads')
761 if max_downloads is not None:
762 if self._num_downloads > int(max_downloads):
763 raise MaxDownloadsReached()
764
765 filename = self.prepare_filename(info_dict)
766
767 # Forced printings
768 if self.params.get('forcetitle', False):
769 self.to_stdout(info_dict['fulltitle'])
770 if self.params.get('forceid', False):
771 self.to_stdout(info_dict['id'])
772 if self.params.get('forceurl', False):
773 # For RTMP URLs, also include the playpath
774 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
775 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
776 self.to_stdout(info_dict['thumbnail'])
777 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
778 self.to_stdout(info_dict['description'])
779 if self.params.get('forcefilename', False) and filename is not None:
780 self.to_stdout(filename)
781 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
782 self.to_stdout(formatSeconds(info_dict['duration']))
783 if self.params.get('forceformat', False):
784 self.to_stdout(info_dict['format'])
785 if self.params.get('forcejson', False):
786 info_dict['_filename'] = filename
787 self.to_stdout(json.dumps(info_dict))
788
789 # Do nothing else if in simulate mode
790 if self.params.get('simulate', False):
791 return
792
793 if filename is None:
794 return
795
796 try:
797 dn = os.path.dirname(encodeFilename(filename))
798 if dn != '' and not os.path.exists(dn):
799 os.makedirs(dn)
800 except (OSError, IOError) as err:
801 self.report_error('unable to create directory ' + compat_str(err))
802 return
803
804 if self.params.get('writedescription', False):
805 descfn = filename + '.description'
806 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
807 self.to_screen('[info] Video description is already present')
808 else:
809 try:
810 self.to_screen('[info] Writing video description to: ' + descfn)
811 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
812 descfile.write(info_dict['description'])
813 except (KeyError, TypeError):
814 self.report_warning('There\'s no description to write.')
815 except (OSError, IOError):
816 self.report_error('Cannot write description file ' + descfn)
817 return
818
819 if self.params.get('writeannotations', False):
820 annofn = filename + '.annotations.xml'
821 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
822 self.to_screen('[info] Video annotations are already present')
823 else:
824 try:
825 self.to_screen('[info] Writing video annotations to: ' + annofn)
826 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
827 annofile.write(info_dict['annotations'])
828 except (KeyError, TypeError):
829 self.report_warning('There are no annotations to write.')
830 except (OSError, IOError):
831 self.report_error('Cannot write annotations file: ' + annofn)
832 return
833
834 subtitles_are_requested = any([self.params.get('writesubtitles', False),
835 self.params.get('writeautomaticsub')])
836
837 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
838 # subtitles download errors are already managed as troubles in relevant IE
839 # that way it will silently go on when used with unsupporting IE
840 subtitles = info_dict['subtitles']
841 sub_format = self.params.get('subtitlesformat', 'srt')
842 for sub_lang in subtitles.keys():
843 sub = subtitles[sub_lang]
844 if sub is None:
845 continue
846 try:
847 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
848 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
849 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
850 else:
851 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
852 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
853 subfile.write(sub)
854 except (OSError, IOError):
855 self.report_error('Cannot write subtitles file ' + descfn)
856 return
857
858 if self.params.get('writeinfojson', False):
859 infofn = os.path.splitext(filename)[0] + '.info.json'
860 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
861 self.to_screen('[info] Video description metadata is already present')
862 else:
863 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
864 try:
865 write_json_file(info_dict, encodeFilename(infofn))
866 except (OSError, IOError):
867 self.report_error('Cannot write metadata to JSON file ' + infofn)
868 return
869
870 if self.params.get('writethumbnail', False):
871 if info_dict.get('thumbnail') is not None:
872 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
873 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
874 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
875 self.to_screen('[%s] %s: Thumbnail is already present' %
876 (info_dict['extractor'], info_dict['id']))
877 else:
878 self.to_screen('[%s] %s: Downloading thumbnail ...' %
879 (info_dict['extractor'], info_dict['id']))
880 try:
881 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
882 with open(thumb_filename, 'wb') as thumbf:
883 shutil.copyfileobj(uf, thumbf)
884 self.to_screen('[%s] %s: Writing thumbnail to: %s' %
885 (info_dict['extractor'], info_dict['id'], thumb_filename))
886 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
887 self.report_warning('Unable to download thumbnail "%s": %s' %
888 (info_dict['thumbnail'], compat_str(err)))
889
890 if not self.params.get('skip_download', False):
891 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
892 success = True
893 else:
894 try:
895 def dl(name, info):
896 fd = get_suitable_downloader(info)(self, self.params)
897 for ph in self._progress_hooks:
898 fd.add_progress_hook(ph)
899 return fd.download(name, info)
900 if info_dict.get('requested_formats') is not None:
901 downloaded = []
902 success = True
903 for f in info_dict['requested_formats']:
904 new_info = dict(info_dict)
905 new_info.update(f)
906 fname = self.prepare_filename(new_info)
907 fname = prepend_extension(fname, 'f%s' % f['format_id'])
908 downloaded.append(fname)
909 partial_success = dl(fname, new_info)
910 success = success and partial_success
911 info_dict['__postprocessors'] = [FFmpegMergerPP(self)]
912 info_dict['__files_to_merge'] = downloaded
913 else:
914 # Just a single file
915 success = dl(filename, info_dict)
916 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
917 self.report_error('unable to download video data: %s' % str(err))
918 return
919 except (OSError, IOError) as err:
920 raise UnavailableVideoError(err)
921 except (ContentTooShortError, ) as err:
922 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
923 return
924
925 if success:
926 try:
927 self.post_process(filename, info_dict)
928 except (PostProcessingError) as err:
929 self.report_error('postprocessing: %s' % str(err))
930 return
931
932 self.record_download_archive(info_dict)
933
934 def download(self, url_list):
935 """Download a given list of URLs."""
936 if (len(url_list) > 1 and
937 '%' not in self.params['outtmpl']
938 and self.params.get('max_downloads') != 1):
939 raise SameFileError(self.params['outtmpl'])
940
941 for url in url_list:
942 try:
943 #It also downloads the videos
944 self.extract_info(url)
945 except UnavailableVideoError:
946 self.report_error('unable to download video')
947 except MaxDownloadsReached:
948 self.to_screen('[info] Maximum number of downloaded files reached.')
949 raise
950
951 return self._download_retcode
952
953 def download_with_info_file(self, info_filename):
954 with io.open(info_filename, 'r', encoding='utf-8') as f:
955 info = json.load(f)
956 try:
957 self.process_ie_result(info, download=True)
958 except DownloadError:
959 webpage_url = info.get('webpage_url')
960 if webpage_url is not None:
961 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
962 return self.download([webpage_url])
963 else:
964 raise
965 return self._download_retcode
966
967 def post_process(self, filename, ie_info):
968 """Run all the postprocessors on the given file."""
969 info = dict(ie_info)
970 info['filepath'] = filename
971 keep_video = None
972 pps_chain = []
973 if ie_info.get('__postprocessors') is not None:
974 pps_chain.extend(ie_info['__postprocessors'])
975 pps_chain.extend(self._pps)
976 for pp in pps_chain:
977 try:
978 keep_video_wish, new_info = pp.run(info)
979 if keep_video_wish is not None:
980 if keep_video_wish:
981 keep_video = keep_video_wish
982 elif keep_video is None:
983 # No clear decision yet, let IE decide
984 keep_video = keep_video_wish
985 except PostProcessingError as e:
986 self.report_error(e.msg)
987 if keep_video is False and not self.params.get('keepvideo', False):
988 try:
989 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
990 os.remove(encodeFilename(filename))
991 except (IOError, OSError):
992 self.report_warning('Unable to remove downloaded video file')
993
994 def _make_archive_id(self, info_dict):
995 # Future-proof against any change in case
996 # and backwards compatibility with prior versions
997 extractor = info_dict.get('extractor_key')
998 if extractor is None:
999 if 'id' in info_dict:
1000 extractor = info_dict.get('ie_key') # key in a playlist
1001 if extractor is None:
1002 return None # Incomplete video information
1003 return extractor.lower() + ' ' + info_dict['id']
1004
1005 def in_download_archive(self, info_dict):
1006 fn = self.params.get('download_archive')
1007 if fn is None:
1008 return False
1009
1010 vid_id = self._make_archive_id(info_dict)
1011 if vid_id is None:
1012 return False # Incomplete video information
1013
1014 try:
1015 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1016 for line in archive_file:
1017 if line.strip() == vid_id:
1018 return True
1019 except IOError as ioe:
1020 if ioe.errno != errno.ENOENT:
1021 raise
1022 return False
1023
1024 def record_download_archive(self, info_dict):
1025 fn = self.params.get('download_archive')
1026 if fn is None:
1027 return
1028 vid_id = self._make_archive_id(info_dict)
1029 assert vid_id
1030 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1031 archive_file.write(vid_id + '\n')
1032
1033 @staticmethod
1034 def format_resolution(format, default='unknown'):
1035 if format.get('vcodec') == 'none':
1036 return 'audio only'
1037 if format.get('resolution') is not None:
1038 return format['resolution']
1039 if format.get('height') is not None:
1040 if format.get('width') is not None:
1041 res = '%sx%s' % (format['width'], format['height'])
1042 else:
1043 res = '%sp' % format['height']
1044 elif format.get('width') is not None:
1045 res = '?x%d' % format['width']
1046 else:
1047 res = default
1048 return res
1049
1050 def list_formats(self, info_dict):
1051 def format_note(fdict):
1052 res = ''
1053 if fdict.get('ext') in ['f4f', 'f4m']:
1054 res += '(unsupported) '
1055 if fdict.get('format_note') is not None:
1056 res += fdict['format_note'] + ' '
1057 if fdict.get('tbr') is not None:
1058 res += '%4dk ' % fdict['tbr']
1059 if (fdict.get('vcodec') is not None and
1060 fdict.get('vcodec') != 'none'):
1061 res += '%-5s@' % fdict['vcodec']
1062 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1063 res += 'video@'
1064 if fdict.get('vbr') is not None:
1065 res += '%4dk' % fdict['vbr']
1066 if fdict.get('acodec') is not None:
1067 if res:
1068 res += ', '
1069 res += '%-5s' % fdict['acodec']
1070 elif fdict.get('abr') is not None:
1071 if res:
1072 res += ', '
1073 res += 'audio'
1074 if fdict.get('abr') is not None:
1075 res += '@%3dk' % fdict['abr']
1076 if fdict.get('filesize') is not None:
1077 if res:
1078 res += ', '
1079 res += format_bytes(fdict['filesize'])
1080 return res
1081
1082 def line(format, idlen=20):
1083 return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1084 format['format_id'],
1085 format['ext'],
1086 self.format_resolution(format),
1087 format_note(format),
1088 ))
1089
1090 formats = info_dict.get('formats', [info_dict])
1091 idlen = max(len('format code'),
1092 max(len(f['format_id']) for f in formats))
1093 formats_s = [line(f, idlen) for f in formats]
1094 if len(formats) > 1:
1095 formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
1096 formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
1097
1098 header_line = line({
1099 'format_id': 'format code', 'ext': 'extension',
1100 'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1101 self.to_screen('[info] Available formats for %s:\n%s\n%s' %
1102 (info_dict['id'], header_line, '\n'.join(formats_s)))
1103
1104 def urlopen(self, req):
1105 """ Start an HTTP download """
1106 return self._opener.open(req)
1107
1108 def print_debug_header(self):
1109 if not self.params.get('verbose'):
1110 return
1111 write_string('[debug] youtube-dl version ' + __version__ + '\n')
1112 try:
1113 sp = subprocess.Popen(
1114 ['git', 'rev-parse', '--short', 'HEAD'],
1115 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1116 cwd=os.path.dirname(os.path.abspath(__file__)))
1117 out, err = sp.communicate()
1118 out = out.decode().strip()
1119 if re.match('[0-9a-f]+', out):
1120 write_string('[debug] Git HEAD: ' + out + '\n')
1121 except:
1122 try:
1123 sys.exc_clear()
1124 except:
1125 pass
1126 write_string('[debug] Python version %s - %s' %
1127 (platform.python_version(), platform_name()) + '\n')
1128
1129 proxy_map = {}
1130 for handler in self._opener.handlers:
1131 if hasattr(handler, 'proxies'):
1132 proxy_map.update(handler.proxies)
1133 write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1134
1135 def _setup_opener(self):
1136 timeout_val = self.params.get('socket_timeout')
1137 timeout = 600 if timeout_val is None else float(timeout_val)
1138
1139 opts_cookiefile = self.params.get('cookiefile')
1140 opts_proxy = self.params.get('proxy')
1141
1142 if opts_cookiefile is None:
1143 self.cookiejar = compat_cookiejar.CookieJar()
1144 else:
1145 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1146 opts_cookiefile)
1147 if os.access(opts_cookiefile, os.R_OK):
1148 self.cookiejar.load()
1149
1150 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1151 self.cookiejar)
1152 if opts_proxy is not None:
1153 if opts_proxy == '':
1154 proxies = {}
1155 else:
1156 proxies = {'http': opts_proxy, 'https': opts_proxy}
1157 else:
1158 proxies = compat_urllib_request.getproxies()
1159 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1160 if 'http' in proxies and 'https' not in proxies:
1161 proxies['https'] = proxies['http']
1162 proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1163
1164 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1165 https_handler = make_HTTPS_handler(
1166 self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
1167 ydlh = YoutubeDLHandler(debuglevel=debuglevel)
1168 opener = compat_urllib_request.build_opener(
1169 https_handler, proxy_handler, cookie_processor, ydlh)
1170 # Delete the default user-agent header, which would otherwise apply in
1171 # cases where our custom HTTP handler doesn't come into play
1172 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1173 opener.addheaders = []
1174 self._opener = opener
1175
1176 # TODO remove this global modification
1177 compat_urllib_request.install_opener(opener)
1178 socket.setdefaulttimeout(timeout)