]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
[youtube] Add support for search result URLs (Fixes #2495)
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import errno
8 import io
9 import json
10 import os
11 import platform
12 import re
13 import shutil
14 import subprocess
15 import socket
16 import sys
17 import time
18 import traceback
19
20 if os.name == 'nt':
21 import ctypes
22
23 from .utils import (
24 compat_cookiejar,
25 compat_http_client,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_request,
29 ContentTooShortError,
30 date_from_str,
31 DateRange,
32 determine_ext,
33 DownloadError,
34 encodeFilename,
35 ExtractorError,
36 format_bytes,
37 formatSeconds,
38 get_term_width,
39 locked_file,
40 make_HTTPS_handler,
41 MaxDownloadsReached,
42 PagedList,
43 PostProcessingError,
44 platform_name,
45 preferredencoding,
46 SameFileError,
47 sanitize_filename,
48 subtitles_filename,
49 takewhile_inclusive,
50 UnavailableVideoError,
51 url_basename,
52 write_json_file,
53 write_string,
54 YoutubeDLHandler,
55 prepend_extension,
56 )
57 from .extractor import get_info_extractor, gen_extractors
58 from .downloader import get_suitable_downloader
59 from .postprocessor import FFmpegMergerPP
60 from .version import __version__
61
62
63 class YoutubeDL(object):
64 """YoutubeDL class.
65
66 YoutubeDL objects are the ones responsible of downloading the
67 actual video file and writing it to disk if the user has requested
68 it, among some other tasks. In most cases there should be one per
69 program. As, given a video URL, the downloader doesn't know how to
70 extract all the needed information, task that InfoExtractors do, it
71 has to pass the URL to one of them.
72
73 For this, YoutubeDL objects have a method that allows
74 InfoExtractors to be registered in a given order. When it is passed
75 a URL, the YoutubeDL object handles it to the first InfoExtractor it
76 finds that reports being able to handle it. The InfoExtractor extracts
77 all the information about the video or videos the URL refers to, and
78 YoutubeDL process the extracted information, possibly using a File
79 Downloader to download the video.
80
81 YoutubeDL objects accept a lot of parameters. In order not to saturate
82 the object constructor with arguments, it receives a dictionary of
83 options instead. These options are available through the params
84 attribute for the InfoExtractors to use. The YoutubeDL also
85 registers itself as the downloader in charge for the InfoExtractors
86 that are added to it, so this is a "mutual registration".
87
88 Available options:
89
90 username: Username for authentication purposes.
91 password: Password for authentication purposes.
92    videopassword:     Password for accessing a video.
93 usenetrc: Use netrc for authentication instead.
94 verbose: Print additional info to stdout.
95 quiet: Do not print messages to stdout.
96 forceurl: Force printing final URL.
97 forcetitle: Force printing title.
98 forceid: Force printing ID.
99 forcethumbnail: Force printing thumbnail URL.
100 forcedescription: Force printing description.
101 forcefilename: Force printing final filename.
102 forceduration: Force printing duration.
103 forcejson: Force printing info_dict as JSON.
104 simulate: Do not download the video files.
105 format: Video format code.
106 format_limit: Highest quality format to try.
107 outtmpl: Template for output names.
108 restrictfilenames: Do not allow "&" and spaces in file names
109 ignoreerrors: Do not stop on download errors.
110 nooverwrites: Prevent overwriting files.
111 playliststart: Playlist item to start at.
112 playlistend: Playlist item to end at.
113 matchtitle: Download only matching titles.
114 rejecttitle: Reject downloads for matching titles.
115 logger: Log messages to a logging.Logger instance.
116 logtostderr: Log messages to stderr instead of stdout.
117 writedescription: Write the video description to a .description file
118 writeinfojson: Write the video description to a .info.json file
119 writeannotations: Write the video annotations to a .annotations.xml file
120 writethumbnail: Write the thumbnail image to a file
121 writesubtitles: Write the video subtitles to a file
122 writeautomaticsub: Write the automatic subtitles to a file
123 allsubtitles: Downloads all the subtitles of the video
124 (requires writesubtitles or writeautomaticsub)
125 listsubtitles: Lists all available subtitles for the video
126 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
127 subtitleslangs: List of languages of the subtitles to download
128 keepvideo: Keep the video file after post-processing
129 daterange: A DateRange object, download only if the upload_date is in the range.
130 skip_download: Skip the actual download of the video file
131 cachedir: Location of the cache files in the filesystem.
132 None to disable filesystem cache.
133 noplaylist: Download single video instead of a playlist if in doubt.
134 age_limit: An integer representing the user's age in years.
135 Unsuitable videos for the given age are skipped.
136 min_views: An integer representing the minimum view count the video
137 must have in order to not be skipped.
138 Videos without view count information are always
139 downloaded. None for no limit.
140 max_views: An integer representing the maximum view count.
141 Videos that are more popular than that are not
142 downloaded.
143 Videos without view count information are always
144 downloaded. None for no limit.
145 download_archive: File name of a file where all downloads are recorded.
146 Videos already present in the file are not downloaded
147 again.
148 cookiefile: File name where cookies should be read from and dumped to.
149 nocheckcertificate:Do not verify SSL certificates
150 proxy: URL of the proxy server to use
151 socket_timeout: Time to wait for unresponsive hosts, in seconds
152 bidi_workaround: Work around buggy terminals without bidirectional text
153                        support, using fribidi
154 debug_printtraffic:Print out sent and received HTTP traffic
155 include_ads: Download ads as well
156 default_search: Prepend this string if an input url is not valid.
157 'auto' for elaborate guessing
158
159 The following parameters are not used by YoutubeDL itself, they are used by
160 the FileDownloader:
161 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
162 noresizebuffer, retries, continuedl, noprogress, consoletitle
163
164 The following options are used by the post processors:
165 prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
166 otherwise prefer avconv.
167 """
168
169 params = None
170 _ies = []
171 _pps = []
172 _download_retcode = None
173 _num_downloads = None
174 _screen_file = None
175
def __init__(self, params=None):
    """Create a FileDownloader object with the given options.

    params: dict of options (see the class docstring for recognized
    keys). It is stored as self.params and also consulted here to set
    up logging targets and the optional bidi terminal workaround.
    """
    if params is None:
        params = {}
    self._ies = []
    self._ies_instances = {}
    self._pps = []
    self._progress_hooks = []
    self._download_retcode = 0
    self._num_downloads = 0
    # Normal messages go to stderr instead of stdout when logtostderr is set
    self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    self._err_file = sys.stderr
    self.params = params

    if params.get('bidi_workaround', False):
        try:
            import pty
            master, slave = pty.openpty()
            width = get_term_width()
            if width is None:
                width_args = []
            else:
                width_args = ['-w', str(width)]
            sp_kwargs = dict(
                stdin=subprocess.PIPE,
                stdout=slave,
                stderr=self._err_file)
            try:
                self._output_process = subprocess.Popen(
                    ['bidiv'] + width_args, **sp_kwargs
                )
            except OSError:
                # bidiv is not installed; fall back to fribidi
                self._output_process = subprocess.Popen(
                    ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
            self._output_channel = os.fdopen(master, 'rb')
        except OSError as ose:
            # Use the named constant instead of the magic number 2
            if ose.errno == errno.ENOENT:
                self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
            else:
                raise

    # BUG FIX: this used params['restrictfilenames'], which raises
    # KeyError when the option was never supplied; use .get() instead.
    if (sys.version_info >= (3,) and sys.platform != 'win32' and
            sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
            and not params.get('restrictfilenames')):
        # On Python 3, the Unicode filesystem API will throw errors (#1474)
        self.report_warning(
            'Assuming --restrict-filenames since file system encoding '
            'cannot encode all characters. '
            'Set the LC_ALL environment variable to fix this.')
        self.params['restrictfilenames'] = True

    if '%(stitle)s' in self.params.get('outtmpl', ''):
        self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')

    self._setup_opener()
231
232 def add_info_extractor(self, ie):
233 """Add an InfoExtractor object to the end of the list."""
234 self._ies.append(ie)
235 self._ies_instances[ie.ie_key()] = ie
236 ie.set_downloader(self)
237
238 def get_info_extractor(self, ie_key):
239 """
240 Get an instance of an IE with name ie_key, it will try to get one from
241 the _ies list, if there's no instance it will create a new one and add
242 it to the extractor list.
243 """
244 ie = self._ies_instances.get(ie_key)
245 if ie is None:
246 ie = get_info_extractor(ie_key)()
247 self.add_info_extractor(ie)
248 return ie
249
def add_default_info_extractors(self):
    """Register every InfoExtractor produced by gen_extractors()."""
    for extractor in gen_extractors():
        self.add_info_extractor(extractor)
256
257 def add_post_processor(self, pp):
258 """Add a PostProcessor object to the end of the chain."""
259 self._pps.append(pp)
260 pp.set_downloader(self)
261
262 def add_progress_hook(self, ph):
263 """Add the progress hook (currently only for the file downloader)"""
264 self._progress_hooks.append(ph)
265
266 def _bidi_workaround(self, message):
267 if not hasattr(self, '_output_channel'):
268 return message
269
270 assert hasattr(self, '_output_process')
271 assert type(message) == type('')
272 line_count = message.count('\n') + 1
273 self._output_process.stdin.write((message + '\n').encode('utf-8'))
274 self._output_process.stdin.flush()
275 res = ''.join(self._output_channel.readline().decode('utf-8')
276 for _ in range(line_count))
277 return res[:-len('\n')]
278
279 def to_screen(self, message, skip_eol=False):
280 """Print message to stdout if not in quiet mode."""
281 return self.to_stdout(message, skip_eol, check_quiet=True)
282
283 def to_stdout(self, message, skip_eol=False, check_quiet=False):
284 """Print message to stdout if not in quiet mode."""
285 if self.params.get('logger'):
286 self.params['logger'].debug(message)
287 elif not check_quiet or not self.params.get('quiet', False):
288 message = self._bidi_workaround(message)
289 terminator = ['\n', ''][skip_eol]
290 output = message + terminator
291
292 write_string(output, self._screen_file)
293
294 def to_stderr(self, message):
295 """Print message to stderr."""
296 assert type(message) == type('')
297 if self.params.get('logger'):
298 self.params['logger'].error(message)
299 else:
300 message = self._bidi_workaround(message)
301 output = message + '\n'
302 write_string(output, self._err_file)
303
304 def to_console_title(self, message):
305 if not self.params.get('consoletitle', False):
306 return
307 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
308 # c_wchar_p() might not be necessary if `message` is
309 # already of type unicode()
310 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
311 elif 'TERM' in os.environ:
312 write_string('\033]0;%s\007' % message, self._screen_file)
313
314 def save_console_title(self):
315 if not self.params.get('consoletitle', False):
316 return
317 if 'TERM' in os.environ:
318 # Save the title on stack
319 write_string('\033[22;0t', self._screen_file)
320
321 def restore_console_title(self):
322 if not self.params.get('consoletitle', False):
323 return
324 if 'TERM' in os.environ:
325 # Restore the title from stack
326 write_string('\033[23;0t', self._screen_file)
327
328 def __enter__(self):
329 self.save_console_title()
330 return self
331
332 def __exit__(self, *args):
333 self.restore_console_title()
334
335 if self.params.get('cookiefile') is not None:
336 self.cookiejar.save()
337
def trouble(self, message=None, tb=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.

    tb, if given, is additional traceback information.
    """
    if message is not None:
        self.to_stderr(message)
    if self.params.get('verbose'):
        if tb is None:
            if sys.exc_info()[0]:  # if .trouble has been called from an except block
                tb = ''
                # Prefer the traceback carried by a wrapped exception
                # (e.g. DownloadError.exc_info), then append the current one
                if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                    tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                tb += compat_str(traceback.format_exc())
            else:
                # Not inside an except block: show the current call stack instead
                tb_data = traceback.format_list(traceback.extract_stack())
                tb = ''.join(tb_data)
        self.to_stderr(tb)
    if not self.params.get('ignoreerrors', False):
        # Re-raise with the most specific exc_info available so callers
        # see the original cause rather than the wrapper
        if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
            exc_info = sys.exc_info()[1].exc_info
        else:
            exc_info = sys.exc_info()
        raise DownloadError(message, exc_info)
    self._download_retcode = 1
367
368 def report_warning(self, message):
369 '''
370 Print the message to stderr, it will be prefixed with 'WARNING:'
371 If stderr is a tty file the 'WARNING:' will be colored
372 '''
373 if self._err_file.isatty() and os.name != 'nt':
374 _msg_header = '\033[0;33mWARNING:\033[0m'
375 else:
376 _msg_header = 'WARNING:'
377 warning_message = '%s %s' % (_msg_header, message)
378 self.to_stderr(warning_message)
379
380 def report_error(self, message, tb=None):
381 '''
382 Do the same as trouble, but prefixes the message with 'ERROR:', colored
383 in red if stderr is a tty file.
384 '''
385 if self._err_file.isatty() and os.name != 'nt':
386 _msg_header = '\033[0;31mERROR:\033[0m'
387 else:
388 _msg_header = 'ERROR:'
389 error_message = '%s %s' % (_msg_header, message)
390 self.trouble(error_message, tb)
391
392 def report_file_already_downloaded(self, file_name):
393 """Report file has already been fully downloaded."""
394 try:
395 self.to_screen('[download] %s has already been downloaded' % file_name)
396 except UnicodeEncodeError:
397 self.to_screen('[download] The file has already been downloaded')
398
def prepare_filename(self, info_dict):
    """Generate the output filename from the 'outtmpl' template.

    Returns the expanded filename, or None when the template cannot
    be expanded (an error is reported in that case).
    """
    try:
        template_dict = dict(info_dict)

        template_dict['epoch'] = int(time.time())
        autonumber_size = self.params.get('autonumber_size')
        if autonumber_size is None:
            autonumber_size = 5
        autonumber_templ = '%0' + str(autonumber_size) + 'd'
        template_dict['autonumber'] = autonumber_templ % self._num_downloads
        if template_dict.get('playlist_index') is not None:
            # Zero-pad so filenames sort in playlist order
            template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']

        # Make every value safe to embed in a filename; ids get the
        # slightly laxer is_id treatment
        sanitize = lambda k, v: sanitize_filename(
            compat_str(v),
            restricted=self.params.get('restrictfilenames'),
            is_id=(k == 'id'))
        template_dict = dict((k, sanitize(k, v))
                             for k, v in template_dict.items()
                             if v is not None)
        # Fields missing from info_dict expand to 'NA' instead of raising
        template_dict = collections.defaultdict(lambda: 'NA', template_dict)

        tmpl = os.path.expanduser(self.params['outtmpl'])
        filename = tmpl % template_dict
        return filename
    except ValueError as err:
        self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
        return None
428
429 def _match_entry(self, info_dict):
430 """ Returns None iff the file should be downloaded """
431
432 video_title = info_dict.get('title', info_dict.get('id', 'video'))
433 if 'title' in info_dict:
434 # This can happen when we're just evaluating the playlist
435 title = info_dict['title']
436 matchtitle = self.params.get('matchtitle', False)
437 if matchtitle:
438 if not re.search(matchtitle, title, re.IGNORECASE):
439 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
440 rejecttitle = self.params.get('rejecttitle', False)
441 if rejecttitle:
442 if re.search(rejecttitle, title, re.IGNORECASE):
443 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
444 date = info_dict.get('upload_date', None)
445 if date is not None:
446 dateRange = self.params.get('daterange', DateRange())
447 if date not in dateRange:
448 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
449 view_count = info_dict.get('view_count', None)
450 if view_count is not None:
451 min_views = self.params.get('min_views')
452 if min_views is not None and view_count < min_views:
453 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
454 max_views = self.params.get('max_views')
455 if max_views is not None and view_count > max_views:
456 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
457 age_limit = self.params.get('age_limit')
458 if age_limit is not None:
459 if age_limit < info_dict.get('age_limit', 0):
460 return 'Skipping "' + title + '" because it is age restricted'
461 if self.in_download_archive(info_dict):
462 return '%s has already been recorded in archive' % video_title
463 return None
464
465 @staticmethod
466 def add_extra_info(info_dict, extra_info):
467 '''Set the keys from extra_info in info dict if they are missing'''
468 for key, value in extra_info.items():
469 info_dict.setdefault(key, value)
470
def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                 process=True):
    '''
    Returns a list with a dictionary for each video we find.
    If 'download', also downloads the videos.
    ie_key forces a specific InfoExtractor; otherwise the first
    registered IE that reports being suitable for the URL is used.
    extra_info is a dict containing the extra values to add to each result
    '''
    # BUG FIX: extra_info used a mutable default ({}), which is shared
    # across calls; use None as sentinel and create a fresh dict here.
    if extra_info is None:
        extra_info = {}

    if ie_key:
        ies = [self.get_info_extractor(ie_key)]
    else:
        ies = self._ies

    for ie in ies:
        if not ie.suitable(url):
            continue

        if not ie.working():
            self.report_warning('The program functionality for this site has been marked as broken, '
                                'and will probably not work.')

        try:
            ie_result = ie.extract(url)
            if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                break
            if isinstance(ie_result, list):
                # Backwards compatibility: old IE result format
                ie_result = {
                    '_type': 'compat_list',
                    'entries': ie_result,
                }
            self.add_extra_info(ie_result,
                {
                    'extractor': ie.IE_NAME,
                    'webpage_url': url,
                    'webpage_url_basename': url_basename(url),
                    'extractor_key': ie.ie_key(),
                })
            if process:
                return self.process_ie_result(ie_result, download, extra_info)
            else:
                return ie_result
        except ExtractorError as de:  # An error we somewhat expected
            self.report_error(compat_str(de), de.format_traceback())
            break
        except MaxDownloadsReached:
            raise
        except Exception as e:
            if self.params.get('ignoreerrors', False):
                self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                break
            else:
                raise
    else:
        self.report_error('no suitable InfoExtractor: %s' % url)
526
def process_ie_result(self, ie_result, download=True, extra_info={}):
    """
    Take the result of the ie(may be modified) and resolve all unresolved
    references (URLs, playlist items).

    It will also download the videos if 'download'.
    Returns the resolved ie_result.
    """
    # NOTE(review): extra_info has a mutable default; it is only read
    # here (never mutated), so the shared-default pitfall does not bite.
    result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
    if result_type == 'video':
        self.add_extra_info(ie_result, extra_info)
        return self.process_video_result(ie_result, download=download)
    elif result_type == 'url':
        # We have to add extra_info to the results because it may be
        # contained in a playlist
        return self.extract_info(ie_result['url'],
                                 download,
                                 ie_key=ie_result.get('ie_key'),
                                 extra_info=extra_info)
    elif result_type == 'url_transparent':
        # Use the information from the embedding page
        info = self.extract_info(
            ie_result['url'], ie_key=ie_result.get('ie_key'),
            extra_info=extra_info, download=False, process=False)

        def make_result(embedded_info):
            # Merge the embedded page's fields over a copy of ie_result:
            # listed fields are dropped and replaced when the embedded
            # info provides them
            new_result = ie_result.copy()
            for f in ('_type', 'url', 'ext', 'player_url', 'formats',
                      'entries', 'ie_key', 'duration',
                      'subtitles', 'annotations', 'format',
                      'thumbnail', 'thumbnails'):
                if f in new_result:
                    del new_result[f]
                if f in embedded_info:
                    new_result[f] = embedded_info[f]
            return new_result
        new_result = make_result(info)

        assert new_result.get('_type') != 'url_transparent'
        if new_result.get('_type') == 'compat_list':
            new_result['entries'] = [
                make_result(e) for e in new_result['entries']]

        # Recurse: the merged result may itself be a video/playlist/...
        return self.process_ie_result(
            new_result, download=download, extra_info=extra_info)
    elif result_type == 'playlist':
        # We process each entry in the playlist
        playlist = ie_result.get('title', None) or ie_result.get('id', None)
        self.to_screen('[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        # playliststart is 1-based in params, 0-based internally
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend', None)
        # For backwards compatibility, interpret -1 as whole list
        if playlistend == -1:
            playlistend = None

        if isinstance(ie_result['entries'], list):
            n_all_entries = len(ie_result['entries'])
            entries = ie_result['entries'][playliststart:playlistend]
            n_entries = len(entries)
            self.to_screen(
                "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))
        else:
            # Lazily paged list of entries; slice without fetching all pages
            assert isinstance(ie_result['entries'], PagedList)
            entries = ie_result['entries'].getslice(
                playliststart, playlistend)
            n_entries = len(entries)
            self.to_screen(
                "[%s] playlist %s: Downloading %d videos" %
                (ie_result['extractor'], playlist, n_entries))

        for i, entry in enumerate(entries, 1):
            self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
            extra = {
                'playlist': playlist,
                'playlist_index': i + playliststart,
                'extractor': ie_result['extractor'],
                'webpage_url': ie_result['webpage_url'],
                'webpage_url_basename': url_basename(ie_result['webpage_url']),
                'extractor_key': ie_result['extractor_key'],
            }

            reason = self._match_entry(entry)
            if reason is not None:
                self.to_screen('[download] ' + reason)
                continue

            entry_result = self.process_ie_result(entry,
                                                  download=download,
                                                  extra_info=extra)
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        return ie_result
    elif result_type == 'compat_list':
        def _fixup(r):
            # Propagate extractor/url metadata onto each legacy entry
            self.add_extra_info(r,
                {
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                })
            return r
        ie_result['entries'] = [
            self.process_ie_result(_fixup(r), download, extra_info)
            for r in ie_result['entries']
        ]
        return ie_result
    else:
        raise Exception('Invalid result type: %s' % result_type)
641
642 def select_format(self, format_spec, available_formats):
643 if format_spec == 'best' or format_spec is None:
644 return available_formats[-1]
645 elif format_spec == 'worst':
646 return available_formats[0]
647 elif format_spec == 'bestaudio':
648 audio_formats = [
649 f for f in available_formats
650 if f.get('vcodec') == 'none']
651 if audio_formats:
652 return audio_formats[-1]
653 elif format_spec == 'worstaudio':
654 audio_formats = [
655 f for f in available_formats
656 if f.get('vcodec') == 'none']
657 if audio_formats:
658 return audio_formats[0]
659 else:
660 extensions = ['mp4', 'flv', 'webm', '3gp']
661 if format_spec in extensions:
662 filter_f = lambda f: f['ext'] == format_spec
663 else:
664 filter_f = lambda f: f['format_id'] == format_spec
665 matches = list(filter(filter_f, available_formats))
666 if matches:
667 return matches[-1]
668 return None
669
def process_video_result(self, info_dict, download=True):
    """Resolve format selection for a single video result.

    Fills in missing per-format fields, applies the user's format
    specification (params['format']) and, if *download* is true, hands
    each selected format to process_info(). Returns info_dict updated
    with the best selected format (backwards compatibility).
    """
    assert info_dict.get('_type', 'video') == 'video'

    if 'playlist' not in info_dict:
        # It isn't part of a playlist
        info_dict['playlist'] = None
        info_dict['playlist_index'] = None

    if 'display_id' not in info_dict and 'id' in info_dict:
        info_dict['display_id'] = info_dict['id']

    # These extractors handle format selection themselves
    if info_dict['extractor'] in ['Youku']:
        if download:
            self.process_info(info_dict)
        return info_dict

    # We now pick which formats have to be downloaded
    if info_dict.get('formats') is None:
        # There's only one format available
        formats = [info_dict]
    else:
        formats = info_dict['formats']

    # We check that all the formats have the format and format_id fields
    for (i, format) in enumerate(formats):
        if format.get('format_id') is None:
            format['format_id'] = compat_str(i)
        if format.get('format') is None:
            format['format'] = '{id} - {res}{note}'.format(
                id=format['format_id'],
                res=self.format_resolution(format),
                note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
            )
        # Automatically determine file extension if missing
        if 'ext' not in format:
            format['ext'] = determine_ext(format['url'])

    # Drop every format after (but including none of) the format_limit one
    format_limit = self.params.get('format_limit', None)
    if format_limit:
        formats = list(takewhile_inclusive(
            lambda f: f['format_id'] != format_limit, formats
        ))

    # TODO Central sorting goes here

    if formats[0] is not info_dict:
        # only set the 'formats' field if the original info_dict lists them;
        # otherwise we end up with a circular reference: the first (and unique)
        # element in the 'formats' field in info_dict is info_dict itself,
        # which can't be exported to json
        info_dict['formats'] = formats
    if self.params.get('listformats', None):
        self.list_formats(info_dict)
        return

    req_format = self.params.get('format')
    if req_format is None:
        req_format = 'best'
    formats_to_download = []
    # The -1 is for supporting YoutubeIE
    if req_format in ('-1', 'all'):
        formats_to_download = formats
    else:
        # We can accept formats requested in the format: 34/5/best, we pick
        # the first that is available, starting from left
        req_formats = req_format.split('/')
        for rf in req_formats:
            if re.match(r'.+?\+.+?', rf) is not None:
                # Two formats have been requested like '137+139'
                format_1, format_2 = rf.split('+')
                formats_info = (self.select_format(format_1, formats),
                                self.select_format(format_2, formats))
                if all(formats_info):
                    selected_format = {
                        'requested_formats': formats_info,
                        'format': rf,
                        'ext': formats_info[0]['ext'],
                    }
                else:
                    # One half of the pair is unavailable; try the next spec
                    selected_format = None
            else:
                selected_format = self.select_format(rf, formats)
            if selected_format is not None:
                formats_to_download = [selected_format]
                break
    if not formats_to_download:
        raise ExtractorError('requested format not available',
                             expected=True)

    if download:
        if len(formats_to_download) > 1:
            self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
        for format in formats_to_download:
            # Each selected format gets its own merged copy of info_dict
            new_info = dict(info_dict)
            new_info.update(format)
            self.process_info(new_info)
    # We update the info dict with the best quality format (backwards compatibility)
    info_dict.update(formats_to_download[-1])
    return info_dict
770
771 def process_info(self, info_dict):
772 """Process a single resolved IE result."""
773
774 assert info_dict.get('_type', 'video') == 'video'
775
776 max_downloads = self.params.get('max_downloads')
777 if max_downloads is not None:
778 if self._num_downloads >= int(max_downloads):
779 raise MaxDownloadsReached()
780
781 info_dict['fulltitle'] = info_dict['title']
782 if len(info_dict['title']) > 200:
783 info_dict['title'] = info_dict['title'][:197] + '...'
784
785 # Keep for backwards compatibility
786 info_dict['stitle'] = info_dict['title']
787
788 if not 'format' in info_dict:
789 info_dict['format'] = info_dict['ext']
790
791 reason = self._match_entry(info_dict)
792 if reason is not None:
793 self.to_screen('[download] ' + reason)
794 return
795
796 self._num_downloads += 1
797
798 filename = self.prepare_filename(info_dict)
799
800 # Forced printings
801 if self.params.get('forcetitle', False):
802 self.to_stdout(info_dict['fulltitle'])
803 if self.params.get('forceid', False):
804 self.to_stdout(info_dict['id'])
805 if self.params.get('forceurl', False):
806 # For RTMP URLs, also include the playpath
807 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
808 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
809 self.to_stdout(info_dict['thumbnail'])
810 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
811 self.to_stdout(info_dict['description'])
812 if self.params.get('forcefilename', False) and filename is not None:
813 self.to_stdout(filename)
814 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
815 self.to_stdout(formatSeconds(info_dict['duration']))
816 if self.params.get('forceformat', False):
817 self.to_stdout(info_dict['format'])
818 if self.params.get('forcejson', False):
819 info_dict['_filename'] = filename
820 self.to_stdout(json.dumps(info_dict))
821
822 # Do nothing else if in simulate mode
823 if self.params.get('simulate', False):
824 return
825
826 if filename is None:
827 return
828
829 try:
830 dn = os.path.dirname(encodeFilename(filename))
831 if dn != '' and not os.path.exists(dn):
832 os.makedirs(dn)
833 except (OSError, IOError) as err:
834 self.report_error('unable to create directory ' + compat_str(err))
835 return
836
837 if self.params.get('writedescription', False):
838 descfn = filename + '.description'
839 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
840 self.to_screen('[info] Video description is already present')
841 else:
842 try:
843 self.to_screen('[info] Writing video description to: ' + descfn)
844 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
845 descfile.write(info_dict['description'])
846 except (KeyError, TypeError):
847 self.report_warning('There\'s no description to write.')
848 except (OSError, IOError):
849 self.report_error('Cannot write description file ' + descfn)
850 return
851
852 if self.params.get('writeannotations', False):
853 annofn = filename + '.annotations.xml'
854 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
855 self.to_screen('[info] Video annotations are already present')
856 else:
857 try:
858 self.to_screen('[info] Writing video annotations to: ' + annofn)
859 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
860 annofile.write(info_dict['annotations'])
861 except (KeyError, TypeError):
862 self.report_warning('There are no annotations to write.')
863 except (OSError, IOError):
864 self.report_error('Cannot write annotations file: ' + annofn)
865 return
866
867 subtitles_are_requested = any([self.params.get('writesubtitles', False),
868 self.params.get('writeautomaticsub')])
869
870 if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
871 # subtitles download errors are already managed as troubles in relevant IE
872 # that way it will silently go on when used with unsupporting IE
873 subtitles = info_dict['subtitles']
874 sub_format = self.params.get('subtitlesformat', 'srt')
875 for sub_lang in subtitles.keys():
876 sub = subtitles[sub_lang]
877 if sub is None:
878 continue
879 try:
880 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
881 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
882 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
883 else:
884 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
885 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
886 subfile.write(sub)
887 except (OSError, IOError):
888 self.report_error('Cannot write subtitles file ' + descfn)
889 return
890
891 if self.params.get('writeinfojson', False):
892 infofn = os.path.splitext(filename)[0] + '.info.json'
893 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
894 self.to_screen('[info] Video description metadata is already present')
895 else:
896 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
897 try:
898 write_json_file(info_dict, encodeFilename(infofn))
899 except (OSError, IOError):
900 self.report_error('Cannot write metadata to JSON file ' + infofn)
901 return
902
903 if self.params.get('writethumbnail', False):
904 if info_dict.get('thumbnail') is not None:
905 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
906 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
907 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
908 self.to_screen('[%s] %s: Thumbnail is already present' %
909 (info_dict['extractor'], info_dict['id']))
910 else:
911 self.to_screen('[%s] %s: Downloading thumbnail ...' %
912 (info_dict['extractor'], info_dict['id']))
913 try:
914 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
915 with open(thumb_filename, 'wb') as thumbf:
916 shutil.copyfileobj(uf, thumbf)
917 self.to_screen('[%s] %s: Writing thumbnail to: %s' %
918 (info_dict['extractor'], info_dict['id'], thumb_filename))
919 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
920 self.report_warning('Unable to download thumbnail "%s": %s' %
921 (info_dict['thumbnail'], compat_str(err)))
922
923 if not self.params.get('skip_download', False):
924 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
925 success = True
926 else:
927 try:
928 def dl(name, info):
929 fd = get_suitable_downloader(info)(self, self.params)
930 for ph in self._progress_hooks:
931 fd.add_progress_hook(ph)
932 return fd.download(name, info)
933 if info_dict.get('requested_formats') is not None:
934 downloaded = []
935 success = True
936 merger = FFmpegMergerPP(self)
937 if not merger._get_executable():
938 postprocessors = []
939 self.report_warning('You have requested multiple '
940 'formats but ffmpeg or avconv are not installed.'
941 ' The formats won\'t be merged')
942 else:
943 postprocessors = [merger]
944 for f in info_dict['requested_formats']:
945 new_info = dict(info_dict)
946 new_info.update(f)
947 fname = self.prepare_filename(new_info)
948 fname = prepend_extension(fname, 'f%s' % f['format_id'])
949 downloaded.append(fname)
950 partial_success = dl(fname, new_info)
951 success = success and partial_success
952 info_dict['__postprocessors'] = postprocessors
953 info_dict['__files_to_merge'] = downloaded
954 else:
955 # Just a single file
956 success = dl(filename, info_dict)
957 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
958 self.report_error('unable to download video data: %s' % str(err))
959 return
960 except (OSError, IOError) as err:
961 raise UnavailableVideoError(err)
962 except (ContentTooShortError, ) as err:
963 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
964 return
965
966 if success:
967 try:
968 self.post_process(filename, info_dict)
969 except (PostProcessingError) as err:
970 self.report_error('postprocessing: %s' % str(err))
971 return
972
973 self.record_download_archive(info_dict)
974
975 def download(self, url_list):
976 """Download a given list of URLs."""
977 if (len(url_list) > 1 and
978 '%' not in self.params['outtmpl']
979 and self.params.get('max_downloads') != 1):
980 raise SameFileError(self.params['outtmpl'])
981
982 for url in url_list:
983 try:
984 #It also downloads the videos
985 self.extract_info(url)
986 except UnavailableVideoError:
987 self.report_error('unable to download video')
988 except MaxDownloadsReached:
989 self.to_screen('[info] Maximum number of downloaded files reached.')
990 raise
991
992 return self._download_retcode
993
994 def download_with_info_file(self, info_filename):
995 with io.open(info_filename, 'r', encoding='utf-8') as f:
996 info = json.load(f)
997 try:
998 self.process_ie_result(info, download=True)
999 except DownloadError:
1000 webpage_url = info.get('webpage_url')
1001 if webpage_url is not None:
1002 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1003 return self.download([webpage_url])
1004 else:
1005 raise
1006 return self._download_retcode
1007
1008 def post_process(self, filename, ie_info):
1009 """Run all the postprocessors on the given file."""
1010 info = dict(ie_info)
1011 info['filepath'] = filename
1012 keep_video = None
1013 pps_chain = []
1014 if ie_info.get('__postprocessors') is not None:
1015 pps_chain.extend(ie_info['__postprocessors'])
1016 pps_chain.extend(self._pps)
1017 for pp in pps_chain:
1018 try:
1019 keep_video_wish, new_info = pp.run(info)
1020 if keep_video_wish is not None:
1021 if keep_video_wish:
1022 keep_video = keep_video_wish
1023 elif keep_video is None:
1024 # No clear decision yet, let IE decide
1025 keep_video = keep_video_wish
1026 except PostProcessingError as e:
1027 self.report_error(e.msg)
1028 if keep_video is False and not self.params.get('keepvideo', False):
1029 try:
1030 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1031 os.remove(encodeFilename(filename))
1032 except (IOError, OSError):
1033 self.report_warning('Unable to remove downloaded video file')
1034
1035 def _make_archive_id(self, info_dict):
1036 # Future-proof against any change in case
1037 # and backwards compatibility with prior versions
1038 extractor = info_dict.get('extractor_key')
1039 if extractor is None:
1040 if 'id' in info_dict:
1041 extractor = info_dict.get('ie_key') # key in a playlist
1042 if extractor is None:
1043 return None # Incomplete video information
1044 return extractor.lower() + ' ' + info_dict['id']
1045
1046 def in_download_archive(self, info_dict):
1047 fn = self.params.get('download_archive')
1048 if fn is None:
1049 return False
1050
1051 vid_id = self._make_archive_id(info_dict)
1052 if vid_id is None:
1053 return False # Incomplete video information
1054
1055 try:
1056 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1057 for line in archive_file:
1058 if line.strip() == vid_id:
1059 return True
1060 except IOError as ioe:
1061 if ioe.errno != errno.ENOENT:
1062 raise
1063 return False
1064
1065 def record_download_archive(self, info_dict):
1066 fn = self.params.get('download_archive')
1067 if fn is None:
1068 return
1069 vid_id = self._make_archive_id(info_dict)
1070 assert vid_id
1071 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1072 archive_file.write(vid_id + '\n')
1073
1074 @staticmethod
1075 def format_resolution(format, default='unknown'):
1076 if format.get('vcodec') == 'none':
1077 return 'audio only'
1078 if format.get('resolution') is not None:
1079 return format['resolution']
1080 if format.get('height') is not None:
1081 if format.get('width') is not None:
1082 res = '%sx%s' % (format['width'], format['height'])
1083 else:
1084 res = '%sp' % format['height']
1085 elif format.get('width') is not None:
1086 res = '?x%d' % format['width']
1087 else:
1088 res = default
1089 return res
1090
    def list_formats(self, info_dict):
        """Print a table of all available formats for a video to the screen."""
        def format_note(fdict):
            # Build a free-text note for one format: unsupported marker,
            # extractor note, bitrates, container and codec details.
            # Separators (', ') are only inserted when text already exists.
            res = ''
            if fdict.get('ext') in ['f4f', 'f4m']:
                res += '(unsupported) '
            if fdict.get('format_note') is not None:
                res += fdict['format_note'] + ' '
            if fdict.get('tbr') is not None:
                res += '%4dk ' % fdict['tbr']
            if fdict.get('container') is not None:
                if res:
                    res += ', '
                res += '%s container' % fdict['container']
            if (fdict.get('vcodec') is not None and
                    fdict.get('vcodec') != 'none'):
                if res:
                    res += ', '
                res += fdict['vcodec']
                # '@' prefixes the video bitrate appended below
                if fdict.get('vbr') is not None:
                    res += '@'
            elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
                # Video bitrate known but codec is not
                res += 'video@'
            if fdict.get('vbr') is not None:
                res += '%4dk' % fdict['vbr']
            if fdict.get('acodec') is not None:
                if res:
                    res += ', '
                if fdict['acodec'] == 'none':
                    res += 'video only'
                else:
                    res += '%-5s' % fdict['acodec']
            elif fdict.get('abr') is not None:
                if res:
                    res += ', '
                res += 'audio'
            if fdict.get('abr') is not None:
                res += '@%3dk' % fdict['abr']
            if fdict.get('asr') is not None:
                res += ' (%5dHz)' % fdict['asr']
            if fdict.get('filesize') is not None:
                if res:
                    res += ', '
                res += format_bytes(fdict['filesize'])
            return res

        def line(format, idlen=20):
            # One table row: format id padded to idlen, then extension,
            # resolution and the free-text note in fixed-width columns.
            return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
                format['format_id'],
                format['ext'],
                self.format_resolution(format),
                format_note(format),
            ))

        # A bare video info dict is treated as its own single format
        formats = info_dict.get('formats', [info_dict])
        # Widen the id column to the longest format id (or the header text)
        idlen = max(len('format code'),
                    max(len(f['format_id']) for f in formats))
        formats_s = [line(f, idlen) for f in formats]
        if len(formats) > 1:
            # Formats are listed in ascending quality order
            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'

        header_line = line({
            'format_id': 'format code', 'ext': 'extension',
            'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
        self.to_screen('[info] Available formats for %s:\n%s\n%s' %
                       (info_dict['id'], header_line, '\n'.join(formats_s)))
1157
1158 def urlopen(self, req):
1159 """ Start an HTTP download """
1160 return self._opener.open(req)
1161
1162 def print_debug_header(self):
1163 if not self.params.get('verbose'):
1164 return
1165 write_string('[debug] youtube-dl version ' + __version__ + '\n')
1166 try:
1167 sp = subprocess.Popen(
1168 ['git', 'rev-parse', '--short', 'HEAD'],
1169 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1170 cwd=os.path.dirname(os.path.abspath(__file__)))
1171 out, err = sp.communicate()
1172 out = out.decode().strip()
1173 if re.match('[0-9a-f]+', out):
1174 write_string('[debug] Git HEAD: ' + out + '\n')
1175 except:
1176 try:
1177 sys.exc_clear()
1178 except:
1179 pass
1180 write_string('[debug] Python version %s - %s' %
1181 (platform.python_version(), platform_name()) + '\n')
1182
1183 proxy_map = {}
1184 for handler in self._opener.handlers:
1185 if hasattr(handler, 'proxies'):
1186 proxy_map.update(handler.proxies)
1187 write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1188
1189 def _setup_opener(self):
1190 timeout_val = self.params.get('socket_timeout')
1191 timeout = 600 if timeout_val is None else float(timeout_val)
1192
1193 opts_cookiefile = self.params.get('cookiefile')
1194 opts_proxy = self.params.get('proxy')
1195
1196 if opts_cookiefile is None:
1197 self.cookiejar = compat_cookiejar.CookieJar()
1198 else:
1199 self.cookiejar = compat_cookiejar.MozillaCookieJar(
1200 opts_cookiefile)
1201 if os.access(opts_cookiefile, os.R_OK):
1202 self.cookiejar.load()
1203
1204 cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1205 self.cookiejar)
1206 if opts_proxy is not None:
1207 if opts_proxy == '':
1208 proxies = {}
1209 else:
1210 proxies = {'http': opts_proxy, 'https': opts_proxy}
1211 else:
1212 proxies = compat_urllib_request.getproxies()
1213 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1214 if 'http' in proxies and 'https' not in proxies:
1215 proxies['https'] = proxies['http']
1216 proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1217
1218 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1219 https_handler = make_HTTPS_handler(
1220 self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
1221 ydlh = YoutubeDLHandler(debuglevel=debuglevel)
1222 opener = compat_urllib_request.build_opener(
1223 https_handler, proxy_handler, cookie_processor, ydlh)
1224 # Delete the default user-agent header, which would otherwise apply in
1225 # cases where our custom HTTP handler doesn't come into play
1226 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1227 opener.addheaders = []
1228 self._opener = opener
1229
1230 # TODO remove this global modification
1231 compat_urllib_request.install_opener(opener)
1232 socket.setdefaulttimeout(timeout)