]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
[servingsys] Add support
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import errno
8 import io
9 import json
10 import os
11 import platform
12 import re
13 import shutil
14 import subprocess
15 import socket
16 import sys
17 import time
18 import traceback
19
20 if os.name == 'nt':
21 import ctypes
22
23 from .utils import (
24 compat_cookiejar,
25 compat_http_client,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_request,
29 ContentTooShortError,
30 date_from_str,
31 DateRange,
32 determine_ext,
33 DownloadError,
34 encodeFilename,
35 ExtractorError,
36 format_bytes,
37 formatSeconds,
38 get_term_width,
39 locked_file,
40 make_HTTPS_handler,
41 MaxDownloadsReached,
42 PostProcessingError,
43 platform_name,
44 preferredencoding,
45 SameFileError,
46 sanitize_filename,
47 subtitles_filename,
48 takewhile_inclusive,
49 UnavailableVideoError,
50 url_basename,
51 write_json_file,
52 write_string,
53 YoutubeDLHandler,
54 prepend_extension,
55 )
56 from .extractor import get_info_extractor, gen_extractors
57 from .downloader import get_suitable_downloader
58 from .postprocessor import FFmpegMergerPP
59 from .version import __version__
60
61
62 class YoutubeDL(object):
63 """YoutubeDL class.
64
65 YoutubeDL objects are the ones responsible of downloading the
66 actual video file and writing it to disk if the user has requested
67 it, among some other tasks. In most cases there should be one per
68 program. As, given a video URL, the downloader doesn't know how to
69 extract all the needed information, task that InfoExtractors do, it
70 has to pass the URL to one of them.
71
72 For this, YoutubeDL objects have a method that allows
73 InfoExtractors to be registered in a given order. When it is passed
74 a URL, the YoutubeDL object handles it to the first InfoExtractor it
75 finds that reports being able to handle it. The InfoExtractor extracts
76 all the information about the video or videos the URL refers to, and
77 YoutubeDL process the extracted information, possibly using a File
78 Downloader to download the video.
79
80 YoutubeDL objects accept a lot of parameters. In order not to saturate
81 the object constructor with arguments, it receives a dictionary of
82 options instead. These options are available through the params
83 attribute for the InfoExtractors to use. The YoutubeDL also
84 registers itself as the downloader in charge for the InfoExtractors
85 that are added to it, so this is a "mutual registration".
86
87 Available options:
88
89 username: Username for authentication purposes.
90 password: Password for authentication purposes.
91     videopassword:     Password for accessing a video.
92 usenetrc: Use netrc for authentication instead.
93 verbose: Print additional info to stdout.
94 quiet: Do not print messages to stdout.
95 forceurl: Force printing final URL.
96 forcetitle: Force printing title.
97 forceid: Force printing ID.
98 forcethumbnail: Force printing thumbnail URL.
99 forcedescription: Force printing description.
100 forcefilename: Force printing final filename.
101 forceduration: Force printing duration.
102 forcejson: Force printing info_dict as JSON.
103 simulate: Do not download the video files.
104 format: Video format code.
105 format_limit: Highest quality format to try.
106 outtmpl: Template for output names.
107 restrictfilenames: Do not allow "&" and spaces in file names
108 ignoreerrors: Do not stop on download errors.
109 nooverwrites: Prevent overwriting files.
110 playliststart: Playlist item to start at.
111 playlistend: Playlist item to end at.
112 matchtitle: Download only matching titles.
113 rejecttitle: Reject downloads for matching titles.
114 logger: Log messages to a logging.Logger instance.
115 logtostderr: Log messages to stderr instead of stdout.
116 writedescription: Write the video description to a .description file
117 writeinfojson: Write the video description to a .info.json file
118 writeannotations: Write the video annotations to a .annotations.xml file
119 writethumbnail: Write the thumbnail image to a file
120 writesubtitles: Write the video subtitles to a file
121 writeautomaticsub: Write the automatic subtitles to a file
122 allsubtitles: Downloads all the subtitles of the video
123 (requires writesubtitles or writeautomaticsub)
124 listsubtitles: Lists all available subtitles for the video
125 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
126 subtitleslangs: List of languages of the subtitles to download
127 keepvideo: Keep the video file after post-processing
128 daterange: A DateRange object, download only if the upload_date is in the range.
129 skip_download: Skip the actual download of the video file
130 cachedir: Location of the cache files in the filesystem.
131 None to disable filesystem cache.
132 noplaylist: Download single video instead of a playlist if in doubt.
133 age_limit: An integer representing the user's age in years.
134 Unsuitable videos for the given age are skipped.
135 min_views: An integer representing the minimum view count the video
136 must have in order to not be skipped.
137 Videos without view count information are always
138 downloaded. None for no limit.
139 max_views: An integer representing the maximum view count.
140 Videos that are more popular than that are not
141 downloaded.
142 Videos without view count information are always
143 downloaded. None for no limit.
144 download_archive: File name of a file where all downloads are recorded.
145 Videos already present in the file are not downloaded
146 again.
147 cookiefile: File name where cookies should be read from and dumped to.
148 nocheckcertificate:Do not verify SSL certificates
149 proxy: URL of the proxy server to use
150 socket_timeout: Time to wait for unresponsive hosts, in seconds
151 bidi_workaround: Work around buggy terminals without bidirectional text
152                        support, using fribidi
153 debug_printtraffic:Print out sent and received HTTP traffic
154 include_ads: Download ads as well
155
156 The following parameters are not used by YoutubeDL itself, they are used by
157 the FileDownloader:
158 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
159 noresizebuffer, retries, continuedl, noprogress, consoletitle
160
161 The following options are used by the post processors:
162 prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
163 otherwise prefer avconv.
164 """
165
    # Class-level defaults for per-instance state; the real values are
    # assigned in __init__().
    params = None            # dict of user options (see class docstring)
    _ies = []                # ordered list of registered InfoExtractors
    _pps = []                # chain of PostProcessor objects
    _download_retcode = None # exit code accumulated across downloads
    _num_downloads = None    # ordinal used for the %(autonumber)s template
    _screen_file = None      # file object used for screen output
172
173 def __init__(self, params=None):
174 """Create a FileDownloader object with the given options."""
175 if params is None:
176 params = {}
177 self._ies = []
178 self._ies_instances = {}
179 self._pps = []
180 self._progress_hooks = []
181 self._download_retcode = 0
182 self._num_downloads = 0
183 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
184 self._err_file = sys.stderr
185 self.params = params
186
187 if params.get('bidi_workaround', False):
188 try:
189 import pty
190 master, slave = pty.openpty()
191 width = get_term_width()
192 if width is None:
193 width_args = []
194 else:
195 width_args = ['-w', str(width)]
196 sp_kwargs = dict(
197 stdin=subprocess.PIPE,
198 stdout=slave,
199 stderr=self._err_file)
200 try:
201 self._output_process = subprocess.Popen(
202 ['bidiv'] + width_args, **sp_kwargs
203 )
204 except OSError:
205 self._output_process = subprocess.Popen(
206 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
207 self._output_channel = os.fdopen(master, 'rb')
208 except OSError as ose:
209 if ose.errno == 2:
210 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
211 else:
212 raise
213
214 if (sys.version_info >= (3,) and sys.platform != 'win32' and
215 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
216 and not params['restrictfilenames']):
217 # On Python 3, the Unicode filesystem API will throw errors (#1474)
218 self.report_warning(
219 'Assuming --restrict-filenames since file system encoding '
220 'cannot encode all charactes. '
221 'Set the LC_ALL environment variable to fix this.')
222 self.params['restrictfilenames'] = True
223
224 if '%(stitle)s' in self.params.get('outtmpl', ''):
225 self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
226
227 self._setup_opener()
228
229 def add_info_extractor(self, ie):
230 """Add an InfoExtractor object to the end of the list."""
231 self._ies.append(ie)
232 self._ies_instances[ie.ie_key()] = ie
233 ie.set_downloader(self)
234
235 def get_info_extractor(self, ie_key):
236 """
237 Get an instance of an IE with name ie_key, it will try to get one from
238 the _ies list, if there's no instance it will create a new one and add
239 it to the extractor list.
240 """
241 ie = self._ies_instances.get(ie_key)
242 if ie is None:
243 ie = get_info_extractor(ie_key)()
244 self.add_info_extractor(ie)
245 return ie
246
247 def add_default_info_extractors(self):
248 """
249 Add the InfoExtractors returned by gen_extractors to the end of the list
250 """
251 for ie in gen_extractors():
252 self.add_info_extractor(ie)
253
254 def add_post_processor(self, pp):
255 """Add a PostProcessor object to the end of the chain."""
256 self._pps.append(pp)
257 pp.set_downloader(self)
258
259 def add_progress_hook(self, ph):
260 """Add the progress hook (currently only for the file downloader)"""
261 self._progress_hooks.append(ph)
262
263 def _bidi_workaround(self, message):
264 if not hasattr(self, '_output_channel'):
265 return message
266
267 assert hasattr(self, '_output_process')
268 assert type(message) == type('')
269 line_count = message.count('\n') + 1
270 self._output_process.stdin.write((message + '\n').encode('utf-8'))
271 self._output_process.stdin.flush()
272 res = ''.join(self._output_channel.readline().decode('utf-8')
273 for _ in range(line_count))
274 return res[:-len('\n')]
275
276 def to_screen(self, message, skip_eol=False):
277 """Print message to stdout if not in quiet mode."""
278 return self.to_stdout(message, skip_eol, check_quiet=True)
279
280 def to_stdout(self, message, skip_eol=False, check_quiet=False):
281 """Print message to stdout if not in quiet mode."""
282 if self.params.get('logger'):
283 self.params['logger'].debug(message)
284 elif not check_quiet or not self.params.get('quiet', False):
285 message = self._bidi_workaround(message)
286 terminator = ['\n', ''][skip_eol]
287 output = message + terminator
288
289 write_string(output, self._screen_file)
290
291 def to_stderr(self, message):
292 """Print message to stderr."""
293 assert type(message) == type('')
294 if self.params.get('logger'):
295 self.params['logger'].error(message)
296 else:
297 message = self._bidi_workaround(message)
298 output = message + '\n'
299 write_string(output, self._err_file)
300
    def to_console_title(self, message):
        """Set the terminal/console window title to *message*, if enabled."""
        if not self.params.get('consoletitle', False):
            return
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm OSC 0 escape sequence: set icon name and window title
            write_string('\033]0;%s\007' % message, self._screen_file)
310
311 def save_console_title(self):
312 if not self.params.get('consoletitle', False):
313 return
314 if 'TERM' in os.environ:
315 # Save the title on stack
316 write_string('\033[22;0t', self._screen_file)
317
318 def restore_console_title(self):
319 if not self.params.get('consoletitle', False):
320 return
321 if 'TERM' in os.environ:
322 # Restore the title from stack
323 write_string('\033[23;0t', self._screen_file)
324
    def __enter__(self):
        """Context-manager entry: save the console title and return self."""
        self.save_console_title()
        return self
328
329 def __exit__(self, *args):
330 self.restore_console_title()
331
332 if self.params.get('cookiefile') is not None:
333 self.cookiejar.save()
334
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        # The active exception carries a wrapped exc_info
                        # (e.g. an ExtractorError with its cause); show it first
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an exception handler: show the current stack
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Prefer the wrapped exception's exc_info, so the DownloadError
            # raised here points at the original cause
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        # ignoreerrors: swallow the problem but remember a failing exit code
        self._download_retcode = 1
364
365 def report_warning(self, message):
366 '''
367 Print the message to stderr, it will be prefixed with 'WARNING:'
368 If stderr is a tty file the 'WARNING:' will be colored
369 '''
370 if self._err_file.isatty() and os.name != 'nt':
371 _msg_header = '\033[0;33mWARNING:\033[0m'
372 else:
373 _msg_header = 'WARNING:'
374 warning_message = '%s %s' % (_msg_header, message)
375 self.to_stderr(warning_message)
376
377 def report_error(self, message, tb=None):
378 '''
379 Do the same as trouble, but prefixes the message with 'ERROR:', colored
380 in red if stderr is a tty file.
381 '''
382 if self._err_file.isatty() and os.name != 'nt':
383 _msg_header = '\033[0;31mERROR:\033[0m'
384 else:
385 _msg_header = 'ERROR:'
386 error_message = '%s %s' % (_msg_header, message)
387 self.trouble(error_message, tb)
388
389 def report_file_already_downloaded(self, file_name):
390 """Report file has already been fully downloaded."""
391 try:
392 self.to_screen('[download] %s has already been downloaded' % file_name)
393 except UnicodeEncodeError:
394 self.to_screen('[download] The file has already been downloaded')
395
396 def increment_downloads(self):
397 """Increment the ordinal that assigns a number to each file."""
398 self._num_downloads += 1
399
    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Expands self.params['outtmpl'] with a sanitized copy of *info_dict*;
        missing template fields expand to 'NA'.  Returns None (after
        reporting the error) when the template cannot be expanded.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            # e.g. '%05d': zero-padded ordinal of this download
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict.get('playlist_index') is not None:
                template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']

            # Make every value safe for use in a filename; 'id' gets the
            # more permissive is_id treatment
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            # Drop None values so they fall through to the 'NA' default below
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            tmpl = os.path.expanduser(self.params['outtmpl'])
            filename = tmpl % template_dict
            return filename
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
429
430 def _match_entry(self, info_dict):
431 """ Returns None iff the file should be downloaded """
432
433 video_title = info_dict.get('title', info_dict.get('id', 'video'))
434 if 'title' in info_dict:
435 # This can happen when we're just evaluating the playlist
436 title = info_dict['title']
437 matchtitle = self.params.get('matchtitle', False)
438 if matchtitle:
439 if not re.search(matchtitle, title, re.IGNORECASE):
440 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
441 rejecttitle = self.params.get('rejecttitle', False)
442 if rejecttitle:
443 if re.search(rejecttitle, title, re.IGNORECASE):
444 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
445 date = info_dict.get('upload_date', None)
446 if date is not None:
447 dateRange = self.params.get('daterange', DateRange())
448 if date not in dateRange:
449 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
450 view_count = info_dict.get('view_count', None)
451 if view_count is not None:
452 min_views = self.params.get('min_views')
453 if min_views is not None and view_count < min_views:
454 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
455 max_views = self.params.get('max_views')
456 if max_views is not None and view_count > max_views:
457 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
458 age_limit = self.params.get('age_limit')
459 if age_limit is not None:
460 if age_limit < info_dict.get('age_limit', 0):
461 return 'Skipping "' + title + '" because it is age restricted'
462 if self.in_download_archive(info_dict):
463 return '%s has already been recorded in archive' % video_title
464 return None
465
466 @staticmethod
467 def add_extra_info(info_dict, extra_info):
468 '''Set the keys from extra_info in info dict if they are missing'''
469 for key, value in extra_info.items():
470 info_dict.setdefault(key, value)
471
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        '''

        # When an extractor is forced via ie_key only that one is tried;
        # otherwise every registered IE is probed in registration order.
        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                # Record provenance on the result without clobbering
                # anything the extractor itself already set
                self.add_extra_info(ie_result,
                    {
                        'extractor': ie.IE_NAME,
                        'webpage_url': url,
                        'webpage_url_basename': url_basename(url),
                        'extractor_key': ie.ie_key(),
                    })
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as de: # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for-else: no extractor accepted the URL
            self.report_error('no suitable InfoExtractor: %s' % url)
525
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']: 'video', 'url', 'url_transparent',
        'playlist' or the legacy 'compat_list'.
        """

        result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            def make_result(embedded_info):
                # Start from the embedding page's data, then let the
                # embedded (resolved) result override the listed fields
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'ie_key', 'duration',
                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
                        new_result[f] = embedded_info[f]
                return new_result
            new_result = make_result(info)

            assert new_result.get('_type') != 'url_transparent'
            if new_result.get('_type') == 'compat_list':
                new_result['entries'] = [
                    make_result(e) for e in new_result['entries']]

            # Recurse: the merged result may itself be a playlist etc.
            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            n_all_entries = len(ie_result['entries'])
            # --playlist-start is 1-based on the command line
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            entries = ie_result['entries'][playliststart:playlistend]
            n_entries = len(entries)

            self.to_screen(
                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
                extra = {
                    'playlist': playlist,
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Apply title/date/view-count/age/archive filters per entry
                reason = self._match_entry(entry)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            def _fixup(r):
                # Propagate provenance to each legacy-format entry
                self.add_extra_info(r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    })
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
632
633 def select_format(self, format_spec, available_formats):
634 if format_spec == 'best' or format_spec is None:
635 return available_formats[-1]
636 elif format_spec == 'worst':
637 return available_formats[0]
638 else:
639 extensions = ['mp4', 'flv', 'webm', '3gp']
640 if format_spec in extensions:
641 filter_f = lambda f: f['ext'] == format_spec
642 else:
643 filter_f = lambda f: f['format_id'] == format_spec
644 matches = list(filter(filter_f, available_formats))
645 if matches:
646 return matches[-1]
647 return None
648
    def process_video_result(self, info_dict, download=True):
        """Resolve the requested format(s) for a single video result and,
        when *download* is true, hand each selected format to process_info().

        Returns info_dict, updated in place with the chosen format fields.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        # We check that all the formats have the format and format_id fields
        for (i, format) in enumerate(formats):
            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url'])

        # --format-limit: cut the list right after the limiting format_id
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict lists them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats', None):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format', 'best')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        # The -1 is for supporting YoutubeIE
        if req_format in ('-1', 'all'):
            formats_to_download = formats
        else:
            # We can accept formats requested in the format: 34/5/best, we pick
            # the first that is available, starting from left
            req_formats = req_format.split('/')
            for rf in req_formats:
                if re.match(r'.+?\+.+?', rf) is not None:
                    # Two formats have been requested like '137+139'
                    format_1, format_2 = rf.split('+')
                    formats_info = (self.select_format(format_1, formats),
                                    self.select_format(format_2, formats))
                    # Only valid when both halves resolved to a format
                    if all(formats_info):
                        selected_format = {
                            'requested_formats': formats_info,
                            'format': rf,
                            'ext': formats_info[0]['ext'],
                        }
                    else:
                        selected_format = None
                else:
                    selected_format = self.select_format(rf, formats)
                if selected_format is not None:
                    formats_to_download = [selected_format]
                    break
        if not formats_to_download:
            raise ExtractorError('requested format not available',
                                 expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
746
747 def process_info(self, info_dict):
748 """Process a single resolved IE result."""
749
750 assert info_dict.get('_type', 'video') == 'video'
751 #We increment the download the download count here to match the previous behaviour.
752 self.increment_downloads()
753
754 info_dict['fulltitle'] = info_dict['title']
755 if len(info_dict['title']) > 200:
756 info_dict['title'] = info_dict['title'][:197] + '...'
757
758 # Keep for backwards compatibility
759 info_dict['stitle'] = info_dict['title']
760
761 if not 'format' in info_dict:
762 info_dict['format'] = info_dict['ext']
763
764 reason = self._match_entry(info_dict)
765 if reason is not None:
766 self.to_screen('[download] ' + reason)
767 return
768
769 max_downloads = self.params.get('max_downloads')
770 if max_downloads is not None:
771 if self._num_downloads > int(max_downloads):
772 raise MaxDownloadsReached()
773
774 filename = self.prepare_filename(info_dict)
775
776 # Forced printings
777 if self.params.get('forcetitle', False):
778 self.to_stdout(info_dict['fulltitle'])
779 if self.params.get('forceid', False):
780 self.to_stdout(info_dict['id'])
781 if self.params.get('forceurl', False):
782 # For RTMP URLs, also include the playpath
783 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
784 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
785 self.to_stdout(info_dict['thumbnail'])
786 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
787 self.to_stdout(info_dict['description'])
788 if self.params.get('forcefilename', False) and filename is not None:
789 self.to_stdout(filename)
790 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
791 self.to_stdout(formatSeconds(info_dict['duration']))
792 if self.params.get('forceformat', False):
793 self.to_stdout(info_dict['format'])
794 if self.params.get('forcejson', False):
795 info_dict['_filename'] = filename
796 self.to_stdout(json.dumps(info_dict))
797
798 # Do nothing else if in simulate mode
799 if self.params.get('simulate', False):
800 return
801
802 if filename is None:
803 return
804
805 try:
806 dn = os.path.dirname(encodeFilename(filename))
807 if dn != '' and not os.path.exists(dn):
808 os.makedirs(dn)
809 except (OSError, IOError) as err:
810 self.report_error('unable to create directory ' + compat_str(err))
811 return
812
813 if self.params.get('writedescription', False):
814 descfn = filename + '.description'
815 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
816 self.to_screen('[info] Video description is already present')
817 else:
818 try:
819 self.to_screen('[info] Writing video description to: ' + descfn)
820 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
821 descfile.write(info_dict['description'])
822 except (KeyError, TypeError):
823 self.report_warning('There\'s no description to write.')
824 except (OSError, IOError):
825 self.report_error('Cannot write description file ' + descfn)
826 return
827
828 if self.params.get('writeannotations', False):
829 annofn = filename + '.annotations.xml'
830 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
831 self.to_screen('[info] Video annotations are already present')
832 else:
833 try:
834 self.to_screen('[info] Writing video annotations to: ' + annofn)
835 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
836 annofile.write(info_dict['annotations'])
837 except (KeyError, TypeError):
838 self.report_warning('There are no annotations to write.')
839 except (OSError, IOError):
840 self.report_error('Cannot write annotations file: ' + annofn)
841 return
842
843 subtitles_are_requested = any([self.params.get('writesubtitles', False),
844 self.params.get('writeautomaticsub')])
845
846 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
847 # subtitles download errors are already managed as troubles in relevant IE
848 # that way it will silently go on when used with unsupporting IE
849 subtitles = info_dict['subtitles']
850 sub_format = self.params.get('subtitlesformat', 'srt')
851 for sub_lang in subtitles.keys():
852 sub = subtitles[sub_lang]
853 if sub is None:
854 continue
855 try:
856 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
857 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
858 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
859 else:
860 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
861 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
862 subfile.write(sub)
863 except (OSError, IOError):
864 self.report_error('Cannot write subtitles file ' + descfn)
865 return
866
867 if self.params.get('writeinfojson', False):
868 infofn = os.path.splitext(filename)[0] + '.info.json'
869 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
870 self.to_screen('[info] Video description metadata is already present')
871 else:
872 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
873 try:
874 write_json_file(info_dict, encodeFilename(infofn))
875 except (OSError, IOError):
876 self.report_error('Cannot write metadata to JSON file ' + infofn)
877 return
878
879 if self.params.get('writethumbnail', False):
880 if info_dict.get('thumbnail') is not None:
881 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
882 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
883 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
884 self.to_screen('[%s] %s: Thumbnail is already present' %
885 (info_dict['extractor'], info_dict['id']))
886 else:
887 self.to_screen('[%s] %s: Downloading thumbnail ...' %
888 (info_dict['extractor'], info_dict['id']))
889 try:
890 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
891 with open(thumb_filename, 'wb') as thumbf:
892 shutil.copyfileobj(uf, thumbf)
893 self.to_screen('[%s] %s: Writing thumbnail to: %s' %
894 (info_dict['extractor'], info_dict['id'], thumb_filename))
895 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
896 self.report_warning('Unable to download thumbnail "%s": %s' %
897 (info_dict['thumbnail'], compat_str(err)))
898
899 if not self.params.get('skip_download', False):
900 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
901 success = True
902 else:
903 try:
904 def dl(name, info):
905 fd = get_suitable_downloader(info)(self, self.params)
906 for ph in self._progress_hooks:
907 fd.add_progress_hook(ph)
908 return fd.download(name, info)
909 if info_dict.get('requested_formats') is not None:
910 downloaded = []
911 success = True
912 merger = FFmpegMergerPP(self)
913 if not merger._get_executable():
914 postprocessors = []
915 self.report_warning('You have requested multiple '
916 'formats but ffmpeg or avconv are not installed.'
917 ' The formats won\'t be merged')
918 else:
919 postprocessors = [merger]
920 for f in info_dict['requested_formats']:
921 new_info = dict(info_dict)
922 new_info.update(f)
923 fname = self.prepare_filename(new_info)
924 fname = prepend_extension(fname, 'f%s' % f['format_id'])
925 downloaded.append(fname)
926 partial_success = dl(fname, new_info)
927 success = success and partial_success
928 info_dict['__postprocessors'] = postprocessors
929 info_dict['__files_to_merge'] = downloaded
930 else:
931 # Just a single file
932 success = dl(filename, info_dict)
933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
934 self.report_error('unable to download video data: %s' % str(err))
935 return
936 except (OSError, IOError) as err:
937 raise UnavailableVideoError(err)
938 except (ContentTooShortError, ) as err:
939 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
940 return
941
942 if success:
943 try:
944 self.post_process(filename, info_dict)
945 except (PostProcessingError) as err:
946 self.report_error('postprocessing: %s' % str(err))
947 return
948
949 self.record_download_archive(info_dict)
950
951 def download(self, url_list):
952 """Download a given list of URLs."""
953 if (len(url_list) > 1 and
954 '%' not in self.params['outtmpl']
955 and self.params.get('max_downloads') != 1):
956 raise SameFileError(self.params['outtmpl'])
957
958 for url in url_list:
959 try:
960 #It also downloads the videos
961 self.extract_info(url)
962 except UnavailableVideoError:
963 self.report_error('unable to download video')
964 except MaxDownloadsReached:
965 self.to_screen('[info] Maximum number of downloaded files reached.')
966 raise
967
968 return self._download_retcode
969
970 def download_with_info_file(self, info_filename):
971 with io.open(info_filename, 'r', encoding='utf-8') as f:
972 info = json.load(f)
973 try:
974 self.process_ie_result(info, download=True)
975 except DownloadError:
976 webpage_url = info.get('webpage_url')
977 if webpage_url is not None:
978 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
979 return self.download([webpage_url])
980 else:
981 raise
982 return self._download_retcode
983
984 def post_process(self, filename, ie_info):
985 """Run all the postprocessors on the given file."""
986 info = dict(ie_info)
987 info['filepath'] = filename
988 keep_video = None
989 pps_chain = []
990 if ie_info.get('__postprocessors') is not None:
991 pps_chain.extend(ie_info['__postprocessors'])
992 pps_chain.extend(self._pps)
993 for pp in pps_chain:
994 try:
995 keep_video_wish, new_info = pp.run(info)
996 if keep_video_wish is not None:
997 if keep_video_wish:
998 keep_video = keep_video_wish
999 elif keep_video is None:
1000 # No clear decision yet, let IE decide
1001 keep_video = keep_video_wish
1002 except PostProcessingError as e:
1003 self.report_error(e.msg)
1004 if keep_video is False and not self.params.get('keepvideo', False):
1005 try:
1006 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1007 os.remove(encodeFilename(filename))
1008 except (IOError, OSError):
1009 self.report_warning('Unable to remove downloaded video file')
1010
1011 def _make_archive_id(self, info_dict):
1012 # Future-proof against any change in case
1013 # and backwards compatibility with prior versions
1014 extractor = info_dict.get('extractor_key')
1015 if extractor is None:
1016 if 'id' in info_dict:
1017 extractor = info_dict.get('ie_key') # key in a playlist
1018 if extractor is None:
1019 return None # Incomplete video information
1020 return extractor.lower() + ' ' + info_dict['id']
1021
1022 def in_download_archive(self, info_dict):
1023 fn = self.params.get('download_archive')
1024 if fn is None:
1025 return False
1026
1027 vid_id = self._make_archive_id(info_dict)
1028 if vid_id is None:
1029 return False # Incomplete video information
1030
1031 try:
1032 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1033 for line in archive_file:
1034 if line.strip() == vid_id:
1035 return True
1036 except IOError as ioe:
1037 if ioe.errno != errno.ENOENT:
1038 raise
1039 return False
1040
1041 def record_download_archive(self, info_dict):
1042 fn = self.params.get('download_archive')
1043 if fn is None:
1044 return
1045 vid_id = self._make_archive_id(info_dict)
1046 assert vid_id
1047 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1048 archive_file.write(vid_id + '\n')
1049
1050 @staticmethod
1051 def format_resolution(format, default='unknown'):
1052 if format.get('vcodec') == 'none':
1053 return 'audio only'
1054 if format.get('resolution') is not None:
1055 return format['resolution']
1056 if format.get('height') is not None:
1057 if format.get('width') is not None:
1058 res = '%sx%s' % (format['width'], format['height'])
1059 else:
1060 res = '%sp' % format['height']
1061 elif format.get('width') is not None:
1062 res = '?x%d' % format['width']
1063 else:
1064 res = default
1065 return res
1066
    def list_formats(self, info_dict):
        """Print a table of all available formats for the given video."""
        def format_note(fdict):
            # Build a short free-text summary (codecs, bitrates, filesize)
            # for one format dict; returns '' when nothing is known.
            res = ''
            if fdict.get('ext') in ['f4f', 'f4m']:
                res += '(unsupported) '
            if fdict.get('format_note') is not None:
                res += fdict['format_note'] + ' '
            if fdict.get('tbr') is not None:
                res += '%4dk ' % fdict['tbr']
            if (fdict.get('vcodec') is not None and
                    fdict.get('vcodec') != 'none'):
                res += '%-5s' % fdict['vcodec']
                # The trailing '@' glues the video codec to the vbr appended below.
                if fdict.get('vbr') is not None:
                    res += '@'
            elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
                # No codec name known, but separate video/audio bitrates exist.
                res += 'video@'
            if fdict.get('vbr') is not None:
                res += '%4dk' % fdict['vbr']
            if fdict.get('acodec') is not None:
                if res:
                    res += ', '
                res += '%-5s' % fdict['acodec']
            elif fdict.get('abr') is not None:
                if res:
                    res += ', '
                res += 'audio'
            if fdict.get('abr') is not None:
                res += '@%3dk' % fdict['abr']
            if fdict.get('filesize') is not None:
                if res:
                    res += ', '
                res += format_bytes(fdict['filesize'])
            return res

        def line(format, idlen=20):
            # One table row: format id, extension, resolution, note,
            # with the id column padded to idlen+1 characters.
            return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
                format['format_id'],
                format['ext'],
                self.format_resolution(format),
                format_note(format),
            ))

        # A plain (non-multi-format) info_dict is listed as its own single format.
        formats = info_dict.get('formats', [info_dict])
        idlen = max(len('format code'),
                    max(len(f['format_id']) for f in formats))
        formats_s = [line(f, idlen) for f in formats]
        # Formats are assumed sorted ascending by quality: first is worst, last is best.
        if len(formats) > 1:
            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'

        header_line = line({
            'format_id': 'format code', 'ext': 'extension',
            'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
        self.to_screen('[info] Available formats for %s:\n%s\n%s' %
                       (info_dict['id'], header_line, '\n'.join(formats_s)))
1122
1123 def urlopen(self, req):
1124 """ Start an HTTP download """
1125 return self._opener.open(req)
1126
1127 def print_debug_header(self):
1128 if not self.params.get('verbose'):
1129 return
1130 write_string('[debug] youtube-dl version ' + __version__ + '\n')
1131 try:
1132 sp = subprocess.Popen(
1133 ['git', 'rev-parse', '--short', 'HEAD'],
1134 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1135 cwd=os.path.dirname(os.path.abspath(__file__)))
1136 out, err = sp.communicate()
1137 out = out.decode().strip()
1138 if re.match('[0-9a-f]+', out):
1139 write_string('[debug] Git HEAD: ' + out + '\n')
1140 except:
1141 try:
1142 sys.exc_clear()
1143 except:
1144 pass
1145 write_string('[debug] Python version %s - %s' %
1146 (platform.python_version(), platform_name()) + '\n')
1147
1148 proxy_map = {}
1149 for handler in self._opener.handlers:
1150 if hasattr(handler, 'proxies'):
1151 proxy_map.update(handler.proxies)
1152 write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1153
    def _setup_opener(self):
        """Build and install the urllib opener used for all HTTP requests.

        Side effects: sets self.cookiejar and self._opener, installs the
        opener globally, and sets the global socket default timeout.
        """
        timeout_val = self.params.get('socket_timeout')
        # Generous 600 s default when no explicit socket timeout was given.
        timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory cookies only.
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            # Persistent Mozilla-format cookie file; load it if readable
            # (a not-yet-existing file is fine, it will be created on save).
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
            self.cookiejar)
        if opts_proxy is not None:
            # An empty --proxy string explicitly disables all proxies.
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # No explicit proxy: honour the environment (http_proxy etc.).
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = compat_urllib_request.ProxyHandler(proxies)

        # debuglevel=1 makes the HTTP(S) handlers dump request/response traffic.
        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(
            self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(debuglevel=debuglevel)
        opener = compat_urllib_request.build_opener(
            https_handler, proxy_handler, cookie_processor, ydlh)
        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener

        # TODO remove this global modification
        compat_urllib_request.install_opener(opener)
        socket.setdefaulttimeout(timeout)