]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
Don't install the global url opener
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import errno
8 import io
9 import json
10 import os
11 import platform
12 import re
13 import shutil
14 import subprocess
15 import socket
16 import sys
17 import time
18 import traceback
19
20 if os.name == 'nt':
21 import ctypes
22
23 from .utils import (
24 compat_cookiejar,
25 compat_http_client,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_request,
29 ContentTooShortError,
30 date_from_str,
31 DateRange,
32 determine_ext,
33 DownloadError,
34 encodeFilename,
35 ExtractorError,
36 format_bytes,
37 formatSeconds,
38 get_term_width,
39 locked_file,
40 make_HTTPS_handler,
41 MaxDownloadsReached,
42 PagedList,
43 PostProcessingError,
44 platform_name,
45 preferredencoding,
46 SameFileError,
47 sanitize_filename,
48 subtitles_filename,
49 takewhile_inclusive,
50 UnavailableVideoError,
51 url_basename,
52 write_json_file,
53 write_string,
54 YoutubeDLHandler,
55 prepend_extension,
56 )
57 from .extractor import get_info_extractor, gen_extractors
58 from .downloader import get_suitable_downloader
59 from .postprocessor import FFmpegMergerPP
60 from .version import __version__
61
62
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL process the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    nooverwrites:      Prevent overwriting files.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatic subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       None to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    cookiefile:        File name where cookies should be read from and dumped to.
    nocheckcertificate:Do not verify SSL certificates
    proxy:             URL of the proxy server to use
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing

    The following parameters are not used by YoutubeDL itself, they are used by
    the FileDownloader:
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle

    The following options are used by the post processors:
    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                       otherwise prefer avconv.
    """

    # Class-level defaults.  Every one of these is rebound to a fresh object
    # in __init__, so the mutable list defaults are never shared between
    # instances in practice.
    params = None               # options dictionary (see the docstring above)
    _ies = []                   # registered InfoExtractor objects, in order
    _pps = []                   # post-processor chain
    _download_retcode = None    # exit code accumulated over downloads
    _num_downloads = None       # number of files downloaded so far
    _screen_file = None         # stream used for normal (non-error) output
175
176 def __init__(self, params=None):
177 """Create a FileDownloader object with the given options."""
178 if params is None:
179 params = {}
180 self._ies = []
181 self._ies_instances = {}
182 self._pps = []
183 self._progress_hooks = []
184 self._download_retcode = 0
185 self._num_downloads = 0
186 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
187 self._err_file = sys.stderr
188 self.params = params
189
190 if params.get('bidi_workaround', False):
191 try:
192 import pty
193 master, slave = pty.openpty()
194 width = get_term_width()
195 if width is None:
196 width_args = []
197 else:
198 width_args = ['-w', str(width)]
199 sp_kwargs = dict(
200 stdin=subprocess.PIPE,
201 stdout=slave,
202 stderr=self._err_file)
203 try:
204 self._output_process = subprocess.Popen(
205 ['bidiv'] + width_args, **sp_kwargs
206 )
207 except OSError:
208 self._output_process = subprocess.Popen(
209 ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
210 self._output_channel = os.fdopen(master, 'rb')
211 except OSError as ose:
212 if ose.errno == 2:
213 self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
214 else:
215 raise
216
217 if (sys.version_info >= (3,) and sys.platform != 'win32' and
218 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
219 and not params['restrictfilenames']):
220 # On Python 3, the Unicode filesystem API will throw errors (#1474)
221 self.report_warning(
222 'Assuming --restrict-filenames since file system encoding '
223 'cannot encode all charactes. '
224 'Set the LC_ALL environment variable to fix this.')
225 self.params['restrictfilenames'] = True
226
227 if '%(stitle)s' in self.params.get('outtmpl', ''):
228 self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
229
230 self._setup_opener()
231
232 def add_info_extractor(self, ie):
233 """Add an InfoExtractor object to the end of the list."""
234 self._ies.append(ie)
235 self._ies_instances[ie.ie_key()] = ie
236 ie.set_downloader(self)
237
238 def get_info_extractor(self, ie_key):
239 """
240 Get an instance of an IE with name ie_key, it will try to get one from
241 the _ies list, if there's no instance it will create a new one and add
242 it to the extractor list.
243 """
244 ie = self._ies_instances.get(ie_key)
245 if ie is None:
246 ie = get_info_extractor(ie_key)()
247 self.add_info_extractor(ie)
248 return ie
249
250 def add_default_info_extractors(self):
251 """
252 Add the InfoExtractors returned by gen_extractors to the end of the list
253 """
254 for ie in gen_extractors():
255 self.add_info_extractor(ie)
256
257 def add_post_processor(self, pp):
258 """Add a PostProcessor object to the end of the chain."""
259 self._pps.append(pp)
260 pp.set_downloader(self)
261
262 def add_progress_hook(self, ph):
263 """Add the progress hook (currently only for the file downloader)"""
264 self._progress_hooks.append(ph)
265
266 def _bidi_workaround(self, message):
267 if not hasattr(self, '_output_channel'):
268 return message
269
270 assert hasattr(self, '_output_process')
271 assert type(message) == type('')
272 line_count = message.count('\n') + 1
273 self._output_process.stdin.write((message + '\n').encode('utf-8'))
274 self._output_process.stdin.flush()
275 res = ''.join(self._output_channel.readline().decode('utf-8')
276 for _ in range(line_count))
277 return res[:-len('\n')]
278
279 def to_screen(self, message, skip_eol=False):
280 """Print message to stdout if not in quiet mode."""
281 return self.to_stdout(message, skip_eol, check_quiet=True)
282
283 def to_stdout(self, message, skip_eol=False, check_quiet=False):
284 """Print message to stdout if not in quiet mode."""
285 if self.params.get('logger'):
286 self.params['logger'].debug(message)
287 elif not check_quiet or not self.params.get('quiet', False):
288 message = self._bidi_workaround(message)
289 terminator = ['\n', ''][skip_eol]
290 output = message + terminator
291
292 write_string(output, self._screen_file)
293
294 def to_stderr(self, message):
295 """Print message to stderr."""
296 assert type(message) == type('')
297 if self.params.get('logger'):
298 self.params['logger'].error(message)
299 else:
300 message = self._bidi_workaround(message)
301 output = message + '\n'
302 write_string(output, self._err_file)
303
304 def to_console_title(self, message):
305 if not self.params.get('consoletitle', False):
306 return
307 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
308 # c_wchar_p() might not be necessary if `message` is
309 # already of type unicode()
310 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
311 elif 'TERM' in os.environ:
312 write_string('\033]0;%s\007' % message, self._screen_file)
313
314 def save_console_title(self):
315 if not self.params.get('consoletitle', False):
316 return
317 if 'TERM' in os.environ:
318 # Save the title on stack
319 write_string('\033[22;0t', self._screen_file)
320
321 def restore_console_title(self):
322 if not self.params.get('consoletitle', False):
323 return
324 if 'TERM' in os.environ:
325 # Restore the title from stack
326 write_string('\033[23;0t', self._screen_file)
327
328 def __enter__(self):
329 self.save_console_title()
330 return self
331
332 def __exit__(self, *args):
333 self.restore_console_title()
334
335 if self.params.get('cookiefile') is not None:
336 self.cookiejar.save()
337
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.

        Raises DownloadError (carrying the current exception info) unless
        the 'ignoreerrors' option is set, in which case the return code is
        set to 1 instead.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the wrapped exc_info an ExtractorError-style
                    # exception carries, then append the current traceback.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: dump the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Propagate the innermost exc_info when the active exception
            # wraps one; otherwise use the active exception itself.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
367
368 def report_warning(self, message):
369 '''
370 Print the message to stderr, it will be prefixed with 'WARNING:'
371 If stderr is a tty file the 'WARNING:' will be colored
372 '''
373 if self.params.get('logger') is not None:
374 self.params['logger'].warning(message)
375 else:
376 if self._err_file.isatty() and os.name != 'nt':
377 _msg_header = '\033[0;33mWARNING:\033[0m'
378 else:
379 _msg_header = 'WARNING:'
380 warning_message = '%s %s' % (_msg_header, message)
381 self.to_stderr(warning_message)
382
383 def report_error(self, message, tb=None):
384 '''
385 Do the same as trouble, but prefixes the message with 'ERROR:', colored
386 in red if stderr is a tty file.
387 '''
388 if self._err_file.isatty() and os.name != 'nt':
389 _msg_header = '\033[0;31mERROR:\033[0m'
390 else:
391 _msg_header = 'ERROR:'
392 error_message = '%s %s' % (_msg_header, message)
393 self.trouble(error_message, tb)
394
395 def report_file_already_downloaded(self, file_name):
396 """Report file has already been fully downloaded."""
397 try:
398 self.to_screen('[download] %s has already been downloaded' % file_name)
399 except UnicodeEncodeError:
400 self.to_screen('[download] The file has already been downloaded')
401
402 def prepare_filename(self, info_dict):
403 """Generate the output filename."""
404 try:
405 template_dict = dict(info_dict)
406
407 template_dict['epoch'] = int(time.time())
408 autonumber_size = self.params.get('autonumber_size')
409 if autonumber_size is None:
410 autonumber_size = 5
411 autonumber_templ = '%0' + str(autonumber_size) + 'd'
412 template_dict['autonumber'] = autonumber_templ % self._num_downloads
413 if template_dict.get('playlist_index') is not None:
414 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
415 if template_dict.get('resolution') is None:
416 if template_dict.get('width') and template_dict.get('height'):
417 template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
418 elif template_dict.get('height'):
419 template_dict['resolution'] = '%sp' % template_dict['height']
420 elif template_dict.get('width'):
421 template_dict['resolution'] = '?x%d' % template_dict['width']
422
423 sanitize = lambda k, v: sanitize_filename(
424 compat_str(v),
425 restricted=self.params.get('restrictfilenames'),
426 is_id=(k == 'id'))
427 template_dict = dict((k, sanitize(k, v))
428 for k, v in template_dict.items()
429 if v is not None)
430 template_dict = collections.defaultdict(lambda: 'NA', template_dict)
431
432 tmpl = os.path.expanduser(self.params['outtmpl'])
433 filename = tmpl % template_dict
434 return filename
435 except ValueError as err:
436 self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
437 return None
438
439 def _match_entry(self, info_dict):
440 """ Returns None iff the file should be downloaded """
441
442 video_title = info_dict.get('title', info_dict.get('id', 'video'))
443 if 'title' in info_dict:
444 # This can happen when we're just evaluating the playlist
445 title = info_dict['title']
446 matchtitle = self.params.get('matchtitle', False)
447 if matchtitle:
448 if not re.search(matchtitle, title, re.IGNORECASE):
449 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
450 rejecttitle = self.params.get('rejecttitle', False)
451 if rejecttitle:
452 if re.search(rejecttitle, title, re.IGNORECASE):
453 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
454 date = info_dict.get('upload_date', None)
455 if date is not None:
456 dateRange = self.params.get('daterange', DateRange())
457 if date not in dateRange:
458 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
459 view_count = info_dict.get('view_count', None)
460 if view_count is not None:
461 min_views = self.params.get('min_views')
462 if min_views is not None and view_count < min_views:
463 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
464 max_views = self.params.get('max_views')
465 if max_views is not None and view_count > max_views:
466 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
467 age_limit = self.params.get('age_limit')
468 if age_limit is not None:
469 if age_limit < info_dict.get('age_limit', 0):
470 return 'Skipping "' + title + '" because it is age restricted'
471 if self.in_download_archive(info_dict):
472 return '%s has already been recorded in archive' % video_title
473 return None
474
475 @staticmethod
476 def add_extra_info(info_dict, extra_info):
477 '''Set the keys from extra_info in info dict if they are missing'''
478 for key, value in extra_info.items():
479 info_dict.setdefault(key, value)
480
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result.
        When ie_key is given, only that extractor is tried; otherwise the
        registered extractors are tried in order. When process is False the
        raw extractor result is returned without further resolution.

        NOTE(review): extra_info has a mutable default that is shared across
        calls; it is only read here, never mutated, so this is currently safe.
        '''

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                # Record which extractor produced the result and where from.
                self.add_extra_info(ie_result,
                    {
                        'extractor': ie.IE_NAME,
                        'webpage_url': url,
                        'webpage_url_basename': url_basename(url),
                        'extractor_key': ie.ie_key(),
                    })
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as de:  # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
            except MaxDownloadsReached:
                # Always propagate: the caller uses this to stop downloading.
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # No registered extractor claimed the URL.
            self.report_error('no suitable InfoExtractor: %s' % url)
536
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie (may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.

        Dispatches on ie_result['_type']: 'video' results are handed to
        process_video_result(); 'url' and 'url_transparent' trigger a new
        extraction round; 'playlist' and 'compat_list' are expanded entry
        by entry, recursing into this method.
        """

        result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            def make_result(embedded_info):
                # Merge the embedded extraction into a copy of ie_result,
                # letting the listed fields come from the embedded info.
                new_result = ie_result.copy()
                for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                          'entries', 'ie_key', 'duration',
                          'subtitles', 'annotations', 'format',
                          'thumbnail', 'thumbnails'):
                    if f in new_result:
                        del new_result[f]
                    if f in embedded_info:
                        new_result[f] = embedded_info[f]
                return new_result
            new_result = make_result(info)

            # The merged result must not be transparent again, or we'd recurse.
            assert new_result.get('_type') != 'url_transparent'
            if new_result.get('_type') == 'compat_list':
                new_result['entries'] = [
                    make_result(e) for e in new_result['entries']]

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in the options; convert to 0-based.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', None)
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            if isinstance(ie_result['entries'], list):
                n_all_entries = len(ie_result['entries'])
                entries = ie_result['entries'][playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            else:
                # Lazily paged playlists only materialize the requested slice.
                assert isinstance(ie_result['entries'], PagedList)
                entries = ie_result['entries'].getslice(
                    playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    "[%s] playlist %s: Downloading %d videos" %
                    (ie_result['extractor'], playlist, n_entries))

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
                extra = {
                    'playlist': playlist,
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Apply the user's filters (title/date/views/...) per entry.
                reason = self._match_entry(entry)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            def _fixup(r):
                # Propagate the source metadata onto each legacy entry.
                self.add_extra_info(r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    })
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
651
652 def select_format(self, format_spec, available_formats):
653 if format_spec == 'best' or format_spec is None:
654 return available_formats[-1]
655 elif format_spec == 'worst':
656 return available_formats[0]
657 elif format_spec == 'bestaudio':
658 audio_formats = [
659 f for f in available_formats
660 if f.get('vcodec') == 'none']
661 if audio_formats:
662 return audio_formats[-1]
663 elif format_spec == 'worstaudio':
664 audio_formats = [
665 f for f in available_formats
666 if f.get('vcodec') == 'none']
667 if audio_formats:
668 return audio_formats[0]
669 else:
670 extensions = ['mp4', 'flv', 'webm', '3gp']
671 if format_spec in extensions:
672 filter_f = lambda f: f['ext'] == format_spec
673 else:
674 filter_f = lambda f: f['format_id'] == format_spec
675 matches = list(filter(filter_f, available_formats))
676 if matches:
677 return matches[-1]
678 return None
679
    def process_video_result(self, info_dict, download=True):
        """Normalize a single video result, select the requested format(s)
        and, if 'download' is set, hand each selection to process_info().

        Returns info_dict updated with the chosen format (or None when only
        listing formats). Raises ExtractorError when the requested format
        is not available.
        """
        assert info_dict.get('_type', 'video') == 'video'

        if 'playlist' not in info_dict:
            # It isn't part of a playlist
            info_dict['playlist'] = None
            info_dict['playlist_index'] = None

        if 'display_id' not in info_dict and 'id' in info_dict:
            info_dict['display_id'] = info_dict['id']

        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
                self.process_info(info_dict)
            return info_dict

        # We now pick which formats have to be downloaded
        if info_dict.get('formats') is None:
            # There's only one format available
            formats = [info_dict]
        else:
            formats = info_dict['formats']

        # We check that all the formats have the format and format_id fields
        for (i, format) in enumerate(formats):
            if format.get('format_id') is None:
                format['format_id'] = compat_str(i)
            if format.get('format') is None:
                format['format'] = '{id} - {res}{note}'.format(
                    id=format['format_id'],
                    res=self.format_resolution(format),
                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                )
            # Automatically determine file extension if missing
            if 'ext' not in format:
                format['ext'] = determine_ext(format['url'])

        # Cut the format list off (inclusively) at the user's quality cap.
        format_limit = self.params.get('format_limit', None)
        if format_limit:
            formats = list(takewhile_inclusive(
                lambda f: f['format_id'] != format_limit, formats
            ))

        # TODO Central sorting goes here

        if formats[0] is not info_dict:
            # only set the 'formats' field if the original info_dict lists them
            # otherwise we end up with a circular reference, the first (and unique)
            # element in the 'formats' field in info_dict is info_dict itself,
            # which can't be exported to json
            info_dict['formats'] = formats
        if self.params.get('listformats', None):
            self.list_formats(info_dict)
            return

        req_format = self.params.get('format')
        if req_format is None:
            req_format = 'best'
        formats_to_download = []
        # The -1 is for supporting YoutubeIE
        if req_format in ('-1', 'all'):
            formats_to_download = formats
        else:
            # We can accept formats requested in the format: 34/5/best, we pick
            # the first that is available, starting from left
            req_formats = req_format.split('/')
            for rf in req_formats:
                if re.match(r'.+?\+.+?', rf) is not None:
                    # Two formats have been requested like '137+139'
                    format_1, format_2 = rf.split('+')
                    formats_info = (self.select_format(format_1, formats),
                                    self.select_format(format_2, formats))
                    # Only valid when both halves resolved to a format.
                    if all(formats_info):
                        selected_format = {
                            'requested_formats': formats_info,
                            'format': rf,
                            'ext': formats_info[0]['ext'],
                        }
                    else:
                        selected_format = None
                else:
                    selected_format = self.select_format(rf, formats)
                if selected_format is not None:
                    formats_to_download = [selected_format]
                    break
            if not formats_to_download:
                raise ExtractorError('requested format not available',
                                     expected=True)

        if download:
            if len(formats_to_download) > 1:
                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
            for format in formats_to_download:
                # Each selected format gets its own merged info dict.
                new_info = dict(info_dict)
                new_info.update(format)
                self.process_info(new_info)
        # We update the info dict with the best quality format (backwards compatibility)
        info_dict.update(formats_to_download[-1])
        return info_dict
780
781 def process_info(self, info_dict):
782 """Process a single resolved IE result."""
783
784 assert info_dict.get('_type', 'video') == 'video'
785
786 max_downloads = self.params.get('max_downloads')
787 if max_downloads is not None:
788 if self._num_downloads >= int(max_downloads):
789 raise MaxDownloadsReached()
790
791 info_dict['fulltitle'] = info_dict['title']
792 if len(info_dict['title']) > 200:
793 info_dict['title'] = info_dict['title'][:197] + '...'
794
795 # Keep for backwards compatibility
796 info_dict['stitle'] = info_dict['title']
797
798 if not 'format' in info_dict:
799 info_dict['format'] = info_dict['ext']
800
801 reason = self._match_entry(info_dict)
802 if reason is not None:
803 self.to_screen('[download] ' + reason)
804 return
805
806 self._num_downloads += 1
807
808 filename = self.prepare_filename(info_dict)
809
810 # Forced printings
811 if self.params.get('forcetitle', False):
812 self.to_stdout(info_dict['fulltitle'])
813 if self.params.get('forceid', False):
814 self.to_stdout(info_dict['id'])
815 if self.params.get('forceurl', False):
816 # For RTMP URLs, also include the playpath
817 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
818 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
819 self.to_stdout(info_dict['thumbnail'])
820 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
821 self.to_stdout(info_dict['description'])
822 if self.params.get('forcefilename', False) and filename is not None:
823 self.to_stdout(filename)
824 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
825 self.to_stdout(formatSeconds(info_dict['duration']))
826 if self.params.get('forceformat', False):
827 self.to_stdout(info_dict['format'])
828 if self.params.get('forcejson', False):
829 info_dict['_filename'] = filename
830 self.to_stdout(json.dumps(info_dict))
831
832 # Do nothing else if in simulate mode
833 if self.params.get('simulate', False):
834 return
835
836 if filename is None:
837 return
838
839 try:
840 dn = os.path.dirname(encodeFilename(filename))
841 if dn != '' and not os.path.exists(dn):
842 os.makedirs(dn)
843 except (OSError, IOError) as err:
844 self.report_error('unable to create directory ' + compat_str(err))
845 return
846
847 if self.params.get('writedescription', False):
848 descfn = filename + '.description'
849 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
850 self.to_screen('[info] Video description is already present')
851 else:
852 try:
853 self.to_screen('[info] Writing video description to: ' + descfn)
854 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
855 descfile.write(info_dict['description'])
856 except (KeyError, TypeError):
857 self.report_warning('There\'s no description to write.')
858 except (OSError, IOError):
859 self.report_error('Cannot write description file ' + descfn)
860 return
861
862 if self.params.get('writeannotations', False):
863 annofn = filename + '.annotations.xml'
864 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
865 self.to_screen('[info] Video annotations are already present')
866 else:
867 try:
868 self.to_screen('[info] Writing video annotations to: ' + annofn)
869 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
870 annofile.write(info_dict['annotations'])
871 except (KeyError, TypeError):
872 self.report_warning('There are no annotations to write.')
873 except (OSError, IOError):
874 self.report_error('Cannot write annotations file: ' + annofn)
875 return
876
877 subtitles_are_requested = any([self.params.get('writesubtitles', False),
878 self.params.get('writeautomaticsub')])
879
880 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
881 # subtitles download errors are already managed as troubles in relevant IE
882 # that way it will silently go on when used with unsupporting IE
883 subtitles = info_dict['subtitles']
884 sub_format = self.params.get('subtitlesformat', 'srt')
885 for sub_lang in subtitles.keys():
886 sub = subtitles[sub_lang]
887 if sub is None:
888 continue
889 try:
890 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
891 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
892 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
893 else:
894 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
895 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
896 subfile.write(sub)
897 except (OSError, IOError):
898 self.report_error('Cannot write subtitles file ' + descfn)
899 return
900
901 if self.params.get('writeinfojson', False):
902 infofn = os.path.splitext(filename)[0] + '.info.json'
903 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
904 self.to_screen('[info] Video description metadata is already present')
905 else:
906 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
907 try:
908 write_json_file(info_dict, encodeFilename(infofn))
909 except (OSError, IOError):
910 self.report_error('Cannot write metadata to JSON file ' + infofn)
911 return
912
913 if self.params.get('writethumbnail', False):
914 if info_dict.get('thumbnail') is not None:
915 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
916 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
917 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
918 self.to_screen('[%s] %s: Thumbnail is already present' %
919 (info_dict['extractor'], info_dict['id']))
920 else:
921 self.to_screen('[%s] %s: Downloading thumbnail ...' %
922 (info_dict['extractor'], info_dict['id']))
923 try:
924 uf = self.urlopen(info_dict['thumbnail'])
925 with open(thumb_filename, 'wb') as thumbf:
926 shutil.copyfileobj(uf, thumbf)
927 self.to_screen('[%s] %s: Writing thumbnail to: %s' %
928 (info_dict['extractor'], info_dict['id'], thumb_filename))
929 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
930 self.report_warning('Unable to download thumbnail "%s": %s' %
931 (info_dict['thumbnail'], compat_str(err)))
932
933 if not self.params.get('skip_download', False):
934 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
935 success = True
936 else:
937 try:
938 def dl(name, info):
939 fd = get_suitable_downloader(info)(self, self.params)
940 for ph in self._progress_hooks:
941 fd.add_progress_hook(ph)
942 return fd.download(name, info)
943 if info_dict.get('requested_formats') is not None:
944 downloaded = []
945 success = True
946 merger = FFmpegMergerPP(self)
947 if not merger._get_executable():
948 postprocessors = []
949 self.report_warning('You have requested multiple '
950 'formats but ffmpeg or avconv are not installed.'
951 ' The formats won\'t be merged')
952 else:
953 postprocessors = [merger]
954 for f in info_dict['requested_formats']:
955 new_info = dict(info_dict)
956 new_info.update(f)
957 fname = self.prepare_filename(new_info)
958 fname = prepend_extension(fname, 'f%s' % f['format_id'])
959 downloaded.append(fname)
960 partial_success = dl(fname, new_info)
961 success = success and partial_success
962 info_dict['__postprocessors'] = postprocessors
963 info_dict['__files_to_merge'] = downloaded
964 else:
965 # Just a single file
966 success = dl(filename, info_dict)
967 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
968 self.report_error('unable to download video data: %s' % str(err))
969 return
970 except (OSError, IOError) as err:
971 raise UnavailableVideoError(err)
972 except (ContentTooShortError, ) as err:
973 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
974 return
975
976 if success:
977 try:
978 self.post_process(filename, info_dict)
979 except (PostProcessingError) as err:
980 self.report_error('postprocessing: %s' % str(err))
981 return
982
983 self.record_download_archive(info_dict)
984
985 def download(self, url_list):
986 """Download a given list of URLs."""
987 if (len(url_list) > 1 and
988 '%' not in self.params['outtmpl']
989 and self.params.get('max_downloads') != 1):
990 raise SameFileError(self.params['outtmpl'])
991
992 for url in url_list:
993 try:
994 #It also downloads the videos
995 self.extract_info(url)
996 except UnavailableVideoError:
997 self.report_error('unable to download video')
998 except MaxDownloadsReached:
999 self.to_screen('[info] Maximum number of downloaded files reached.')
1000 raise
1001
1002 return self._download_retcode
1003
1004 def download_with_info_file(self, info_filename):
1005 with io.open(info_filename, 'r', encoding='utf-8') as f:
1006 info = json.load(f)
1007 try:
1008 self.process_ie_result(info, download=True)
1009 except DownloadError:
1010 webpage_url = info.get('webpage_url')
1011 if webpage_url is not None:
1012 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1013 return self.download([webpage_url])
1014 else:
1015 raise
1016 return self._download_retcode
1017
1018 def post_process(self, filename, ie_info):
1019 """Run all the postprocessors on the given file."""
1020 info = dict(ie_info)
1021 info['filepath'] = filename
1022 keep_video = None
1023 pps_chain = []
1024 if ie_info.get('__postprocessors') is not None:
1025 pps_chain.extend(ie_info['__postprocessors'])
1026 pps_chain.extend(self._pps)
1027 for pp in pps_chain:
1028 try:
1029 keep_video_wish, new_info = pp.run(info)
1030 if keep_video_wish is not None:
1031 if keep_video_wish:
1032 keep_video = keep_video_wish
1033 elif keep_video is None:
1034 # No clear decision yet, let IE decide
1035 keep_video = keep_video_wish
1036 except PostProcessingError as e:
1037 self.report_error(e.msg)
1038 if keep_video is False and not self.params.get('keepvideo', False):
1039 try:
1040 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1041 os.remove(encodeFilename(filename))
1042 except (IOError, OSError):
1043 self.report_warning('Unable to remove downloaded video file')
1044
1045 def _make_archive_id(self, info_dict):
1046 # Future-proof against any change in case
1047 # and backwards compatibility with prior versions
1048 extractor = info_dict.get('extractor_key')
1049 if extractor is None:
1050 if 'id' in info_dict:
1051 extractor = info_dict.get('ie_key') # key in a playlist
1052 if extractor is None:
1053 return None # Incomplete video information
1054 return extractor.lower() + ' ' + info_dict['id']
1055
1056 def in_download_archive(self, info_dict):
1057 fn = self.params.get('download_archive')
1058 if fn is None:
1059 return False
1060
1061 vid_id = self._make_archive_id(info_dict)
1062 if vid_id is None:
1063 return False # Incomplete video information
1064
1065 try:
1066 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1067 for line in archive_file:
1068 if line.strip() == vid_id:
1069 return True
1070 except IOError as ioe:
1071 if ioe.errno != errno.ENOENT:
1072 raise
1073 return False
1074
1075 def record_download_archive(self, info_dict):
1076 fn = self.params.get('download_archive')
1077 if fn is None:
1078 return
1079 vid_id = self._make_archive_id(info_dict)
1080 assert vid_id
1081 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1082 archive_file.write(vid_id + '\n')
1083
1084 @staticmethod
1085 def format_resolution(format, default='unknown'):
1086 if format.get('vcodec') == 'none':
1087 return 'audio only'
1088 if format.get('resolution') is not None:
1089 return format['resolution']
1090 if format.get('height') is not None:
1091 if format.get('width') is not None:
1092 res = '%sx%s' % (format['width'], format['height'])
1093 else:
1094 res = '%sp' % format['height']
1095 elif format.get('width') is not None:
1096 res = '?x%d' % format['width']
1097 else:
1098 res = default
1099 return res
1100
    def list_formats(self, info_dict):
        """Print a table of all available formats for a video to the screen."""
        def format_note(fdict):
            # Build a short free-form description of one format dict:
            # bitrates, codecs, container, sample rate and file size.
            res = ''
            if fdict.get('ext') in ['f4f', 'f4m']:
                res += '(unsupported) '
            if fdict.get('format_note') is not None:
                res += fdict['format_note'] + ' '
            if fdict.get('tbr') is not None:
                res += '%4dk ' % fdict['tbr']
            if fdict.get('container') is not None:
                if res:
                    res += ', '
                res += '%s container' % fdict['container']
            if (fdict.get('vcodec') is not None and
                    fdict.get('vcodec') != 'none'):
                if res:
                    res += ', '
                res += fdict['vcodec']
                # The '@' prefixes the video bitrate appended below.
                if fdict.get('vbr') is not None:
                    res += '@'
            elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
                # No video codec known, but separate video/audio bitrates
                # exist: label the bitrate explicitly as the video one.
                res += 'video@'
            if fdict.get('vbr') is not None:
                res += '%4dk' % fdict['vbr']
            if fdict.get('acodec') is not None:
                if res:
                    res += ', '
                if fdict['acodec'] == 'none':
                    res += 'video only'
                else:
                    res += '%-5s' % fdict['acodec']
            elif fdict.get('abr') is not None:
                if res:
                    res += ', '
                res += 'audio'
            if fdict.get('abr') is not None:
                res += '@%3dk' % fdict['abr']
            if fdict.get('asr') is not None:
                res += ' (%5dHz)' % fdict['asr']
            if fdict.get('filesize') is not None:
                if res:
                    res += ', '
                res += format_bytes(fdict['filesize'])
            return res

        def line(format, idlen=20):
            # One table row: format id, extension, resolution, note;
            # the id column is padded to idlen + 1 characters.
            return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
                format['format_id'],
                format['ext'],
                self.format_resolution(format),
                format_note(format),
            ))

        # A video without a 'formats' list is treated as its own sole format.
        formats = info_dict.get('formats', [info_dict])
        idlen = max(len('format code'),
                    max(len(f['format_id']) for f in formats))
        formats_s = [line(f, idlen) for f in formats]
        if len(formats) > 1:
            # NOTE(review): assumes formats are sorted worst-to-best by the
            # extractor — confirm against the sorting done upstream.
            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'

        header_line = line({
            'format_id': 'format code', 'ext': 'extension',
            'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
        self.to_screen('[info] Available formats for %s:\n%s\n%s' %
                       (info_dict['id'], header_line, '\n'.join(formats_s)))
1167
    def urlopen(self, req):
        """ Start an HTTP download """
        # Uses the private opener and timeout configured in _setup_opener so
        # that cookie, proxy and certificate options are applied consistently.
        return self._opener.open(req, timeout=self._socket_timeout)
1171
1172 def print_debug_header(self):
1173 if not self.params.get('verbose'):
1174 return
1175 write_string('[debug] youtube-dl version ' + __version__ + '\n')
1176 try:
1177 sp = subprocess.Popen(
1178 ['git', 'rev-parse', '--short', 'HEAD'],
1179 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1180 cwd=os.path.dirname(os.path.abspath(__file__)))
1181 out, err = sp.communicate()
1182 out = out.decode().strip()
1183 if re.match('[0-9a-f]+', out):
1184 write_string('[debug] Git HEAD: ' + out + '\n')
1185 except:
1186 try:
1187 sys.exc_clear()
1188 except:
1189 pass
1190 write_string('[debug] Python version %s - %s' %
1191 (platform.python_version(), platform_name()) + '\n')
1192
1193 proxy_map = {}
1194 for handler in self._opener.handlers:
1195 if hasattr(handler, 'proxies'):
1196 proxy_map.update(handler.proxies)
1197 write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1198
    def _setup_opener(self):
        """Build the private URL opener (cookies, proxies, HTTPS options).

        Stores the opener in self._opener and the configured timeout in
        self._socket_timeout; the opener is not installed globally.
        """
        timeout_val = self.params.get('socket_timeout')
        # Fall back to a generous 600 s timeout when none was requested.
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory cookies only; nothing is persisted.
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Only load the cookie file if it is already readable;
            # a brand-new file simply starts out empty.
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
            self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty proxy string disables all proxies,
            # including the ones taken from the environment.
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = compat_urllib_request.ProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(
            self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(debuglevel=debuglevel)
        opener = compat_urllib_request.build_opener(
            https_handler, proxy_handler, cookie_processor, ydlh)
        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener