]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dl/YoutubeDL.py
[vrt] Add support for direct hls playlists and YouTube (Closes #9108)
[yt-dlp.git] / youtube_dl / YoutubeDL.py
... / ...
CommitLineData
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from __future__ import absolute_import, unicode_literals
5
6import collections
7import contextlib
8import datetime
9import errno
10import fileinput
11import io
12import itertools
13import json
14import locale
15import operator
16import os
17import platform
18import re
19import shutil
20import subprocess
21import socket
22import sys
23import time
24import tokenize
25import traceback
26
27from .compat import (
28 compat_basestring,
29 compat_cookiejar,
30 compat_expanduser,
31 compat_get_terminal_size,
32 compat_http_client,
33 compat_kwargs,
34 compat_os_name,
35 compat_str,
36 compat_tokenize_tokenize,
37 compat_urllib_error,
38 compat_urllib_request,
39 compat_urllib_request_DataHandler,
40)
41from .utils import (
42 age_restricted,
43 args_to_str,
44 ContentTooShortError,
45 date_from_str,
46 DateRange,
47 DEFAULT_OUTTMPL,
48 determine_ext,
49 determine_protocol,
50 DownloadError,
51 encode_compat_str,
52 encodeFilename,
53 error_to_compat_str,
54 ExtractorError,
55 format_bytes,
56 formatSeconds,
57 locked_file,
58 make_HTTPS_handler,
59 MaxDownloadsReached,
60 PagedList,
61 parse_filesize,
62 PerRequestProxyHandler,
63 platform_name,
64 PostProcessingError,
65 preferredencoding,
66 prepend_extension,
67 render_table,
68 replace_extension,
69 SameFileError,
70 sanitize_filename,
71 sanitize_path,
72 sanitize_url,
73 sanitized_Request,
74 std_headers,
75 subtitles_filename,
76 UnavailableVideoError,
77 url_basename,
78 version_tuple,
79 write_json_file,
80 write_string,
81 YoutubeDLCookieProcessor,
82 YoutubeDLHandler,
83)
84from .cache import Cache
85from .extractor import get_info_extractor, gen_extractors
86from .downloader import get_suitable_downloader
87from .downloader.rtmp import rtmpdump_version
88from .postprocessor import (
89 FFmpegFixupM3u8PP,
90 FFmpegFixupM4aPP,
91 FFmpegFixupStretchedPP,
92 FFmpegMergerPP,
93 FFmpegPostProcessor,
94 get_postprocessor,
95)
96from .version import __version__
97
98if compat_os_name == 'nt':
99 import ctypes
100
101
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do),
    it has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    no_warnings:       Do not print out anything for warnings.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceduration:     Force printing duration.
    forcejson:         Force printing info_dict as JSON.
    dump_single_json:  Force printing the info_dict of the whole playlist
                       (or video) as a single JSON line.
    simulate:          Do not download the video files.
    format:            Video format code. See options.py for more information.
    outtmpl:           Template for output names.
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    force_generic_extractor: Force downloader to use the generic extractor
    nooverwrites:      Prevent overwriting files.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    playlist_items:    Specific indices of playlist to download.
    playlistreverse:   Download playlist items in reverse order.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logger:            Log messages to a logging.Logger instance.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writeannotations:  Write the video annotations to a .annotations.xml file
    writethumbnail:    Write the thumbnail image to a file
    write_all_thumbnails:  Write all thumbnail formats to files
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatically generated subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file
    cachedir:          Location of the cache files in the filesystem.
                       False to disable filesystem cache.
    noplaylist:        Download single video instead of a playlist if in doubt.
    age_limit:         An integer representing the user's age in years.
                       Unsuitable videos for the given age are skipped.
    min_views:         An integer representing the minimum view count the video
                       must have in order to not be skipped.
                       Videos without view count information are always
                       downloaded. None for no limit.
    max_views:         An integer representing the maximum view count.
                       Videos that are more popular than that are not
                       downloaded.
                       Videos without view count information are always
                       downloaded. None for no limit.
    download_archive:  File name of a file where all downloads are recorded.
                       Videos already present in the file are not downloaded
                       again.
    cookiefile:        File name where cookies should be read from and dumped to.
    nocheckcertificate:Do not verify SSL certificates
    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                       At the moment, this is only supported by YouTube.
    proxy:             URL of the proxy server to use
    cn_verification_proxy:  URL of the proxy to use for IP address verification
                       on Chinese sites. (Experimental)
    socket_timeout:    Time to wait for unresponsive hosts, in seconds
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    default_search:    Prepend this string if an input url is not valid.
                       'auto' for elaborate guessing
    encoding:          Use this encoding instead of the system-specified.
    extract_flat:      Do not resolve URLs, return the immediate result.
                       Pass in 'in_playlist' to only show this behavior for
                       playlist items.
    postprocessors:    A list of dictionaries, each with an entry
                       * key:  The name of the postprocessor. See
                               youtube_dl/postprocessor/__init__.py for a list.
                       as well as any further keyword arguments for the
                       postprocessor.
    progress_hooks:    A list of functions that get called on download
                       progress, with a dictionary with the entries
                       * status: One of "downloading", "error", or "finished".
                                 Check this first and ignore unknown values.

                       If status is one of "downloading", or "finished", the
                       following properties may also be present:
                       * filename: The final filename (always present)
                       * tmpfilename: The filename we're currently writing to
                       * downloaded_bytes: Bytes on disk
                       * total_bytes: Size of the whole file, None if unknown
                       * total_bytes_estimate: Guess of the eventual file size,
                                               None if unavailable.
                       * elapsed: The number of seconds since download started.
                       * eta: The estimated time in seconds, None if unknown
                       * speed: The download speed in bytes/second, None if
                                unknown
                       * fragment_index: The counter of the currently
                                         downloaded video fragment.
                       * fragment_count: The number of fragments (= individual
                                         files that will be merged)

                       Progress hooks are guaranteed to be called at least once
                       (with status "finished") if the download is successful.
    merge_output_format: Extension to use when merging formats.
    fixup:             Automatically correct known faults of the file.
                       One of:
                       - "never": do nothing
                       - "warn": only emit a warning
                       - "detect_or_warn": check whether we can do anything
                                           about it, warn otherwise (default)
    source_address:    (Experimental) Client-side IP address to bind to.
    call_home:         Boolean, true iff we are allowed to contact the
                       youtube-dl servers for debugging.
    sleep_interval:    Number of seconds to sleep before each download.
    listformats:       Print an overview of available video formats and exit.
    list_thumbnails:   Print a table of all thumbnails and exit.
    match_filter:      A function that gets called with the info_dict of
                       every video.
                       If it returns a message, the video is ignored.
                       If it returns None, the video is downloaded.
                       match_filter_func in utils.py is one example for this.
    no_color:          Do not emit color codes in output.

    The following options determine which downloader is picked:
    external_downloader: Executable of the external downloader to call.
                       None or unset for standard (built-in) downloader.
    hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.

    The following parameters are not used by YoutubeDL itself, they are used by
    the downloader (see youtube_dl/downloader/common.py):
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle,
    xattr_set_filesize, external_downloader_args, hls_use_mpegts.

    The following options are used by the post processors:
    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                       otherwise prefer avconv.
    postprocessor_args: A list of additional command-line arguments for the
                        postprocessor.
    """

    # Class-level defaults; every one of these is overwritten per-instance in
    # __init__. They exist so attribute access is safe even on objects that
    # are only partially constructed.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
284
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params: options dictionary (see the class docstring); None means
            all defaults.
        auto_init: when True, print the debug header and register the
            default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Boolean-as-index: picks stderr only when logtostderr is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        if params.get('bidi_workaround', False):
            try:
                import pty
                # Spawn bidiv/fribidi as a child writing to a pty so
                # bidirectional text renders on terminals lacking bidi support.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv unavailable; fall back to fribidi.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == 2:  # ENOENT: neither helper binary was found
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate configured postprocessors: each dict carries the class
        # name under 'key' plus keyword arguments for its constructor.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)
361
362 def warn_if_short_id(self, argv):
363 # short YouTube ID starting with dash?
364 idxs = [
365 i for i, a in enumerate(argv)
366 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
367 if idxs:
368 correct_argv = (
369 ['youtube-dl'] +
370 [a for i, a in enumerate(argv) if i not in idxs] +
371 ['--'] + [argv[i] for i in idxs]
372 )
373 self.report_warning(
374 'Long argument string detected. '
375 'Use -- to separate parameters and URLs, like this:\n%s\n' %
376 args_to_str(correct_argv))
377
378 def add_info_extractor(self, ie):
379 """Add an InfoExtractor object to the end of the list."""
380 self._ies.append(ie)
381 self._ies_instances[ie.ie_key()] = ie
382 ie.set_downloader(self)
383
384 def get_info_extractor(self, ie_key):
385 """
386 Get an instance of an IE with name ie_key, it will try to get one from
387 the _ies list, if there's no instance it will create a new one and add
388 it to the extractor list.
389 """
390 ie = self._ies_instances.get(ie_key)
391 if ie is None:
392 ie = get_info_extractor(ie_key)()
393 self.add_info_extractor(ie)
394 return ie
395
396 def add_default_info_extractors(self):
397 """
398 Add the InfoExtractors returned by gen_extractors to the end of the list
399 """
400 for ie in gen_extractors():
401 self.add_info_extractor(ie)
402
403 def add_post_processor(self, pp):
404 """Add a PostProcessor object to the end of the chain."""
405 self._pps.append(pp)
406 pp.set_downloader(self)
407
    def add_progress_hook(self, ph):
        """Register ph, a callable receiving progress-status dicts
        (currently only invoked by the file downloader)."""
        self._progress_hooks.append(ph)
411
412 def _bidi_workaround(self, message):
413 if not hasattr(self, '_output_channel'):
414 return message
415
416 assert hasattr(self, '_output_process')
417 assert isinstance(message, compat_str)
418 line_count = message.count('\n') + 1
419 self._output_process.stdin.write((message + '\n').encode('utf-8'))
420 self._output_process.stdin.flush()
421 res = ''.join(self._output_channel.readline().decode('utf-8')
422 for _ in range(line_count))
423 return res[:-len('\n')]
424
425 def to_screen(self, message, skip_eol=False):
426 """Print message to stdout if not in quiet mode."""
427 return self.to_stdout(message, skip_eol, check_quiet=True)
428
    def _write_string(self, s, out=None):
        """Write s to out, honoring the user-selected 'encoding' parameter."""
        write_string(s, out=out, encoding=self.params.get('encoding'))
431
432 def to_stdout(self, message, skip_eol=False, check_quiet=False):
433 """Print message to stdout if not in quiet mode."""
434 if self.params.get('logger'):
435 self.params['logger'].debug(message)
436 elif not check_quiet or not self.params.get('quiet', False):
437 message = self._bidi_workaround(message)
438 terminator = ['\n', ''][skip_eol]
439 output = message + terminator
440
441 self._write_string(output, self._screen_file)
442
443 def to_stderr(self, message):
444 """Print message to stderr."""
445 assert isinstance(message, compat_str)
446 if self.params.get('logger'):
447 self.params['logger'].error(message)
448 else:
449 message = self._bidi_workaround(message)
450 output = message + '\n'
451 self._write_string(output, self._err_file)
452
    def to_console_title(self, message):
        """Set the console/terminal window title to message, when enabled."""
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm OSC 0 escape: sets both icon name and window title.
            self._write_string('\033]0;%s\007' % message, self._screen_file)
462
463 def save_console_title(self):
464 if not self.params.get('consoletitle', False):
465 return
466 if 'TERM' in os.environ:
467 # Save the title on stack
468 self._write_string('\033[22;0t', self._screen_file)
469
470 def restore_console_title(self):
471 if not self.params.get('consoletitle', False):
472 return
473 if 'TERM' in os.environ:
474 # Restore the title from stack
475 self._write_string('\033[23;0t', self._screen_file)
476
    def __enter__(self):
        """Context-manager entry: saves the console title and returns self."""
        self.save_console_title()
        return self
480
    def __exit__(self, *args):
        """Context-manager exit: restores the console title and persists
        cookies when a cookie file is configured."""
        self.restore_console_title()

        if self.params.get('cookiefile') is not None:
            self.cookiejar.save()
486
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Prefer the traceback of a wrapped cause when the active
                    # exception carries one in .exc_info (e.g. ExtractorError).
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise with the most specific exc_info available so the
            # DownloadError points at the original failure.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
516
517 def report_warning(self, message):
518 '''
519 Print the message to stderr, it will be prefixed with 'WARNING:'
520 If stderr is a tty file the 'WARNING:' will be colored
521 '''
522 if self.params.get('logger') is not None:
523 self.params['logger'].warning(message)
524 else:
525 if self.params.get('no_warnings'):
526 return
527 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
528 _msg_header = '\033[0;33mWARNING:\033[0m'
529 else:
530 _msg_header = 'WARNING:'
531 warning_message = '%s %s' % (_msg_header, message)
532 self.to_stderr(warning_message)
533
534 def report_error(self, message, tb=None):
535 '''
536 Do the same as trouble, but prefixes the message with 'ERROR:', colored
537 in red if stderr is a tty file.
538 '''
539 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
540 _msg_header = '\033[0;31mERROR:\033[0m'
541 else:
542 _msg_header = 'ERROR:'
543 error_message = '%s %s' % (_msg_header, message)
544 self.trouble(error_message, tb)
545
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen('[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            # file_name may not be representable in the console encoding;
            # fall back to a message that omits it.
            self.to_screen('[download] The file has already been downloaded')
552
    def prepare_filename(self, info_dict):
        """Generate the output filename by rendering the outtmpl template.

        Returns the sanitized path, or None if the template is invalid.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict.get('playlist_index') is not None:
                # Zero-pad the index to the width of the playlist length.
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            if template_dict.get('resolution') is None:
                # Synthesize a resolution string from whatever dimensions exist.
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Sanitize every value for filesystem use; the 'id' field gets the
            # more permissive is_id treatment.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            # Missing template fields render as 'NA' instead of raising KeyError.
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
595
596 def _match_entry(self, info_dict, incomplete):
597 """ Returns None iff the file should be downloaded """
598
599 video_title = info_dict.get('title', info_dict.get('id', 'video'))
600 if 'title' in info_dict:
601 # This can happen when we're just evaluating the playlist
602 title = info_dict['title']
603 matchtitle = self.params.get('matchtitle', False)
604 if matchtitle:
605 if not re.search(matchtitle, title, re.IGNORECASE):
606 return '"' + title + '" title did not match pattern "' + matchtitle + '"'
607 rejecttitle = self.params.get('rejecttitle', False)
608 if rejecttitle:
609 if re.search(rejecttitle, title, re.IGNORECASE):
610 return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
611 date = info_dict.get('upload_date')
612 if date is not None:
613 dateRange = self.params.get('daterange', DateRange())
614 if date not in dateRange:
615 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
616 view_count = info_dict.get('view_count')
617 if view_count is not None:
618 min_views = self.params.get('min_views')
619 if min_views is not None and view_count < min_views:
620 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
621 max_views = self.params.get('max_views')
622 if max_views is not None and view_count > max_views:
623 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
624 if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
625 return 'Skipping "%s" because it is age restricted' % video_title
626 if self.in_download_archive(info_dict):
627 return '%s has already been recorded in archive' % video_title
628
629 if not incomplete:
630 match_filter = self.params.get('match_filter')
631 if match_filter is not None:
632 ret = match_filter(info_dict)
633 if ret is not None:
634 return ret
635
636 return None
637
638 @staticmethod
639 def add_extra_info(info_dict, extra_info):
640 '''Set the keys from extra_info in info dict if they are missing'''
641 for key, value in extra_info.items():
642 info_dict.setdefault(key, value)
643
644 def extract_info(self, url, download=True, ie_key=None, extra_info={},
645 process=True, force_generic_extractor=False):
646 '''
647 Returns a list with a dictionary for each video we find.
648 If 'download', also downloads the videos.
649 extra_info is a dict containing the extra values to add to each result
650 '''
651
652 if not ie_key and force_generic_extractor:
653 ie_key = 'Generic'
654
655 if ie_key:
656 ies = [self.get_info_extractor(ie_key)]
657 else:
658 ies = self._ies
659
660 for ie in ies:
661 if not ie.suitable(url):
662 continue
663
664 if not ie.working():
665 self.report_warning('The program functionality for this site has been marked as broken, '
666 'and will probably not work.')
667
668 try:
669 ie_result = ie.extract(url)
670 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
671 break
672 if isinstance(ie_result, list):
673 # Backwards compatibility: old IE result format
674 ie_result = {
675 '_type': 'compat_list',
676 'entries': ie_result,
677 }
678 self.add_default_extra_info(ie_result, ie, url)
679 if process:
680 return self.process_ie_result(ie_result, download, extra_info)
681 else:
682 return ie_result
683 except ExtractorError as e: # An error we somewhat expected
684 self.report_error(compat_str(e), e.format_traceback())
685 break
686 except MaxDownloadsReached:
687 raise
688 except Exception as e:
689 if self.params.get('ignoreerrors', False):
690 self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
691 break
692 else:
693 raise
694 else:
695 self.report_error('no suitable InfoExtractor for URL %s' % url)
696
697 def add_default_extra_info(self, ie_result, ie, url):
698 self.add_extra_info(ie_result, {
699 'extractor': ie.IE_NAME,
700 'webpage_url': url,
701 'webpage_url_basename': url_basename(url),
702 'extractor_key': ie.ie_key(),
703 })
704
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            extract_flat = self.params.get('extract_flat', False)
            # With extract_flat, do not resolve the reference any further;
            # 'in_playlist' limits this to entries found inside a playlist.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None fields from the embedding page override the target
            # page's, except for the reference bookkeeping keys below.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in params; convert to 0-based slicing.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # NOTE(review): the parameter shadows the builtin `format`;
                # kept as-is here since this is a documentation-only change.
                def iter_playlistitems(format):
                    # Expand specs like "1-3,7" into individual 1-based indices.
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may be a plain list, a lazily-paged PagedList, or any
            # other iterable; each case selects the requested slice differently.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    # 1-based indices; negative values count from the end.
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    # Arbitrary iterables need materializing for random access.
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: only playlist-level metadata is known here,
                # so filters needing the full info_dict are deferred.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Propagate the parent's bookkeeping keys into each entry.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
868
869 def _build_format_filter(self, filter_spec):
870 " Returns a function to filter the formats according to the filter_spec "
871
872 OPERATORS = {
873 '<': operator.lt,
874 '<=': operator.le,
875 '>': operator.gt,
876 '>=': operator.ge,
877 '=': operator.eq,
878 '!=': operator.ne,
879 }
880 operator_rex = re.compile(r'''(?x)\s*
881 (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
882 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
883 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
884 $
885 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
886 m = operator_rex.search(filter_spec)
887 if m:
888 try:
889 comparison_value = int(m.group('value'))
890 except ValueError:
891 comparison_value = parse_filesize(m.group('value'))
892 if comparison_value is None:
893 comparison_value = parse_filesize(m.group('value') + 'B')
894 if comparison_value is None:
895 raise ValueError(
896 'Invalid value %r in format specification %r' % (
897 m.group('value'), filter_spec))
898 op = OPERATORS[m.group('op')]
899
900 if not m:
901 STR_OPERATORS = {
902 '=': operator.eq,
903 '!=': operator.ne,
904 '^=': lambda attr, value: attr.startswith(value),
905 '$=': lambda attr, value: attr.endswith(value),
906 '*=': lambda attr, value: value in attr,
907 }
908 str_operator_rex = re.compile(r'''(?x)
909 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
910 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
911 \s*(?P<value>[a-zA-Z0-9._-]+)
912 \s*$
913 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
914 m = str_operator_rex.search(filter_spec)
915 if m:
916 comparison_value = m.group('value')
917 op = STR_OPERATORS[m.group('op')]
918
919 if not m:
920 raise ValueError('Invalid filter specification %r' % filter_spec)
921
922 def _filter(f):
923 actual_value = f.get(m.group('key'))
924 if actual_value is None:
925 return m.group('none_inclusive')
926 return op(actual_value, comparison_value)
927 return _filter
928
    def build_format_selector(self, format_spec):
        """Compile a format specification string (e.g.
        'bestvideo+bestaudio/best', 'mp4[height<=720]', '(webm,mp4)')
        into a selector function that maps a list of format dicts to the
        format dict(s) chosen for download.

        The spec is tokenized with the stdlib `tokenize` module, parsed
        by a small recursive-descent parser into a tree of
        FormatSelector nodes, and that tree is then compiled into nested
        generator functions. Raises SyntaxError (via syntax_error()) on
        malformed specs.
        """
        def syntax_error(note, start):
            # Build (not raise) a SyntaxError whose message points a caret
            # at column start[1] of the offending spec.
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Parse-tree node types:
        PICKFIRST = 'PICKFIRST'  # 'a/b': first alternative yielding formats
        MERGE = 'MERGE'          # 'video+audio': merge two formats
        SINGLE = 'SINGLE'        # plain name: 'best', an ext, a format_id, ...
        GROUP = 'GROUP'          # parenthesized sub-expression
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Re-join everything up to the closing ']' into one filter string
            # (compiled later by self._build_format_filter).
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    if last_string:
                        # flush the pending joined NAME token first
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Accumulate adjacent NAME/NUMBER/other-OP tokens into a
                    # single synthetic NAME token.
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser; the inside_* flags tell it which
            # delimiters terminate the current sub-expression.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A bare filter '[...]' implicitly filters 'best'
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Compile a FormatSelector node (or a list of sibling nodes)
            # into a generator function over a list of format dicts.
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(formats):
                    for f in fs:
                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(formats):
                    # First alternative that yields at least one format wins.
                    for f in fs:
                        picked_formats = list(f(formats))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(formats):
                    formats = list(formats)
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        # Formats are assumed sorted worst-first, so [-1] is best.
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                              all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # Otherwise the spec is an extension or a format_id.
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    # Build a synthetic merged-format dict from a
                    # (video, audio) pair; returns None on invalid pairs.
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(formats):
                    formats = list(formats)
                    # Every video candidate is paired with every audio candidate.
                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
                        yield _merge(pair)

            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(formats):
                # Apply the node's '[...]' filters before selecting.
                for _filter in filters:
                    formats = list(filter(_filter, formats))
                return selector_function(formats)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Iterator over the token list with single-token push-back
            # (restore_last_token), as required by the parser above.
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__  # Python 2 iterator protocol

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1191
1192 def _calc_headers(self, info_dict):
1193 res = std_headers.copy()
1194
1195 add_headers = info_dict.get('http_headers')
1196 if add_headers:
1197 res.update(add_headers)
1198
1199 cookies = self._calc_cookies(info_dict)
1200 if cookies:
1201 res['Cookie'] = cookies
1202
1203 return res
1204
1205 def _calc_cookies(self, info_dict):
1206 pr = sanitized_Request(info_dict['url'])
1207 self.cookiejar.add_cookie_header(pr)
1208 return pr.get_header('Cookie')
1209
1210 def process_video_result(self, info_dict, download=True):
1211 assert info_dict.get('_type', 'video') == 'video'
1212
1213 if 'id' not in info_dict:
1214 raise ExtractorError('Missing "id" field in extractor result')
1215 if 'title' not in info_dict:
1216 raise ExtractorError('Missing "title" field in extractor result')
1217
1218 if 'playlist' not in info_dict:
1219 # It isn't part of a playlist
1220 info_dict['playlist'] = None
1221 info_dict['playlist_index'] = None
1222
1223 thumbnails = info_dict.get('thumbnails')
1224 if thumbnails is None:
1225 thumbnail = info_dict.get('thumbnail')
1226 if thumbnail:
1227 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1228 if thumbnails:
1229 thumbnails.sort(key=lambda t: (
1230 t.get('preference'), t.get('width'), t.get('height'),
1231 t.get('id'), t.get('url')))
1232 for i, t in enumerate(thumbnails):
1233 t['url'] = sanitize_url(t['url'])
1234 if t.get('width') and t.get('height'):
1235 t['resolution'] = '%dx%d' % (t['width'], t['height'])
1236 if t.get('id') is None:
1237 t['id'] = '%d' % i
1238
1239 if self.params.get('list_thumbnails'):
1240 self.list_thumbnails(info_dict)
1241 return
1242
1243 if thumbnails and 'thumbnail' not in info_dict:
1244 info_dict['thumbnail'] = thumbnails[-1]['url']
1245
1246 if 'display_id' not in info_dict and 'id' in info_dict:
1247 info_dict['display_id'] = info_dict['id']
1248
1249 if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1250 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1251 # see http://bugs.python.org/issue1646728)
1252 try:
1253 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1254 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1255 except (ValueError, OverflowError, OSError):
1256 pass
1257
1258 # Auto generate title fields corresponding to the *_number fields when missing
1259 # in order to always have clean titles. This is very common for TV series.
1260 for field in ('chapter', 'season', 'episode'):
1261 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1262 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1263
1264 subtitles = info_dict.get('subtitles')
1265 if subtitles:
1266 for _, subtitle in subtitles.items():
1267 for subtitle_format in subtitle:
1268 if subtitle_format.get('url'):
1269 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1270 if 'ext' not in subtitle_format:
1271 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1272
1273 if self.params.get('listsubtitles', False):
1274 if 'automatic_captions' in info_dict:
1275 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1276 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1277 return
1278 info_dict['requested_subtitles'] = self.process_subtitles(
1279 info_dict['id'], subtitles,
1280 info_dict.get('automatic_captions'))
1281
1282 # We now pick which formats have to be downloaded
1283 if info_dict.get('formats') is None:
1284 # There's only one format available
1285 formats = [info_dict]
1286 else:
1287 formats = info_dict['formats']
1288
1289 if not formats:
1290 raise ExtractorError('No video formats found!')
1291
1292 formats_dict = {}
1293
1294 # We check that all the formats have the format and format_id fields
1295 for i, format in enumerate(formats):
1296 if 'url' not in format:
1297 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1298
1299 format['url'] = sanitize_url(format['url'])
1300
1301 if format.get('format_id') is None:
1302 format['format_id'] = compat_str(i)
1303 else:
1304 # Sanitize format_id from characters used in format selector expression
1305 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1306 format_id = format['format_id']
1307 if format_id not in formats_dict:
1308 formats_dict[format_id] = []
1309 formats_dict[format_id].append(format)
1310
1311 # Make sure all formats have unique format_id
1312 for format_id, ambiguous_formats in formats_dict.items():
1313 if len(ambiguous_formats) > 1:
1314 for i, format in enumerate(ambiguous_formats):
1315 format['format_id'] = '%s-%d' % (format_id, i)
1316
1317 for i, format in enumerate(formats):
1318 if format.get('format') is None:
1319 format['format'] = '{id} - {res}{note}'.format(
1320 id=format['format_id'],
1321 res=self.format_resolution(format),
1322 note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1323 )
1324 # Automatically determine file extension if missing
1325 if 'ext' not in format:
1326 format['ext'] = determine_ext(format['url']).lower()
1327 # Automatically determine protocol if missing (useful for format
1328 # selection purposes)
1329 if 'protocol' not in format:
1330 format['protocol'] = determine_protocol(format)
1331 # Add HTTP headers, so that external programs can use them from the
1332 # json output
1333 full_format_info = info_dict.copy()
1334 full_format_info.update(format)
1335 format['http_headers'] = self._calc_headers(full_format_info)
1336
1337 # TODO Central sorting goes here
1338
1339 if formats[0] is not info_dict:
1340 # only set the 'formats' fields if the original info_dict list them
1341 # otherwise we end up with a circular reference, the first (and unique)
1342 # element in the 'formats' field in info_dict is info_dict itself,
1343 # which can't be exported to json
1344 info_dict['formats'] = formats
1345 if self.params.get('listformats'):
1346 self.list_formats(info_dict)
1347 return
1348
1349 req_format = self.params.get('format')
1350 if req_format is None:
1351 req_format_list = []
1352 if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1353 not info_dict.get('is_live')):
1354 merger = FFmpegMergerPP(self)
1355 if merger.available and merger.can_merge():
1356 req_format_list.append('bestvideo+bestaudio')
1357 req_format_list.append('best')
1358 req_format = '/'.join(req_format_list)
1359 format_selector = self.build_format_selector(req_format)
1360 formats_to_download = list(format_selector(formats))
1361 if not formats_to_download:
1362 raise ExtractorError('requested format not available',
1363 expected=True)
1364
1365 if download:
1366 if len(formats_to_download) > 1:
1367 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1368 for format in formats_to_download:
1369 new_info = dict(info_dict)
1370 new_info.update(format)
1371 self.process_info(new_info)
1372 # We update the info dict with the best quality format (backwards compatibility)
1373 info_dict.update(formats_to_download[-1])
1374 return info_dict
1375
1376 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1377 """Select the requested subtitles and their format"""
1378 available_subs = {}
1379 if normal_subtitles and self.params.get('writesubtitles'):
1380 available_subs.update(normal_subtitles)
1381 if automatic_captions and self.params.get('writeautomaticsub'):
1382 for lang, cap_info in automatic_captions.items():
1383 if lang not in available_subs:
1384 available_subs[lang] = cap_info
1385
1386 if (not self.params.get('writesubtitles') and not
1387 self.params.get('writeautomaticsub') or not
1388 available_subs):
1389 return None
1390
1391 if self.params.get('allsubtitles', False):
1392 requested_langs = available_subs.keys()
1393 else:
1394 if self.params.get('subtitleslangs', False):
1395 requested_langs = self.params.get('subtitleslangs')
1396 elif 'en' in available_subs:
1397 requested_langs = ['en']
1398 else:
1399 requested_langs = [list(available_subs.keys())[0]]
1400
1401 formats_query = self.params.get('subtitlesformat', 'best')
1402 formats_preference = formats_query.split('/') if formats_query else []
1403 subs = {}
1404 for lang in requested_langs:
1405 formats = available_subs.get(lang)
1406 if formats is None:
1407 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1408 continue
1409 for ext in formats_preference:
1410 if ext == 'best':
1411 f = formats[-1]
1412 break
1413 matches = list(filter(lambda f: f['ext'] == ext, formats))
1414 if matches:
1415 f = matches[-1]
1416 break
1417 else:
1418 f = formats[-1]
1419 self.report_warning(
1420 'No subtitle format found matching "%s" for language %s, '
1421 'using %s' % (formats_query, lang, f['ext']))
1422 subs[lang] = f
1423 return subs
1424
1425 def process_info(self, info_dict):
1426 """Process a single resolved IE result."""
1427
1428 assert info_dict.get('_type', 'video') == 'video'
1429
1430 max_downloads = self.params.get('max_downloads')
1431 if max_downloads is not None:
1432 if self._num_downloads >= int(max_downloads):
1433 raise MaxDownloadsReached()
1434
1435 info_dict['fulltitle'] = info_dict['title']
1436 if len(info_dict['title']) > 200:
1437 info_dict['title'] = info_dict['title'][:197] + '...'
1438
1439 if 'format' not in info_dict:
1440 info_dict['format'] = info_dict['ext']
1441
1442 reason = self._match_entry(info_dict, incomplete=False)
1443 if reason is not None:
1444 self.to_screen('[download] ' + reason)
1445 return
1446
1447 self._num_downloads += 1
1448
1449 info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1450
1451 # Forced printings
1452 if self.params.get('forcetitle', False):
1453 self.to_stdout(info_dict['fulltitle'])
1454 if self.params.get('forceid', False):
1455 self.to_stdout(info_dict['id'])
1456 if self.params.get('forceurl', False):
1457 if info_dict.get('requested_formats') is not None:
1458 for f in info_dict['requested_formats']:
1459 self.to_stdout(f['url'] + f.get('play_path', ''))
1460 else:
1461 # For RTMP URLs, also include the playpath
1462 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1463 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1464 self.to_stdout(info_dict['thumbnail'])
1465 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1466 self.to_stdout(info_dict['description'])
1467 if self.params.get('forcefilename', False) and filename is not None:
1468 self.to_stdout(filename)
1469 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1470 self.to_stdout(formatSeconds(info_dict['duration']))
1471 if self.params.get('forceformat', False):
1472 self.to_stdout(info_dict['format'])
1473 if self.params.get('forcejson', False):
1474 self.to_stdout(json.dumps(info_dict))
1475
1476 # Do nothing else if in simulate mode
1477 if self.params.get('simulate', False):
1478 return
1479
1480 if filename is None:
1481 return
1482
1483 try:
1484 dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1485 if dn and not os.path.exists(dn):
1486 os.makedirs(dn)
1487 except (OSError, IOError) as err:
1488 self.report_error('unable to create directory ' + error_to_compat_str(err))
1489 return
1490
1491 if self.params.get('writedescription', False):
1492 descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1493 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1494 self.to_screen('[info] Video description is already present')
1495 elif info_dict.get('description') is None:
1496 self.report_warning('There\'s no description to write.')
1497 else:
1498 try:
1499 self.to_screen('[info] Writing video description to: ' + descfn)
1500 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1501 descfile.write(info_dict['description'])
1502 except (OSError, IOError):
1503 self.report_error('Cannot write description file ' + descfn)
1504 return
1505
1506 if self.params.get('writeannotations', False):
1507 annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1508 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1509 self.to_screen('[info] Video annotations are already present')
1510 else:
1511 try:
1512 self.to_screen('[info] Writing video annotations to: ' + annofn)
1513 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1514 annofile.write(info_dict['annotations'])
1515 except (KeyError, TypeError):
1516 self.report_warning('There are no annotations to write.')
1517 except (OSError, IOError):
1518 self.report_error('Cannot write annotations file: ' + annofn)
1519 return
1520
1521 subtitles_are_requested = any([self.params.get('writesubtitles', False),
1522 self.params.get('writeautomaticsub')])
1523
1524 if subtitles_are_requested and info_dict.get('requested_subtitles'):
1525 # subtitles download errors are already managed as troubles in relevant IE
1526 # that way it will silently go on when used with unsupporting IE
1527 subtitles = info_dict['requested_subtitles']
1528 ie = self.get_info_extractor(info_dict['extractor_key'])
1529 for sub_lang, sub_info in subtitles.items():
1530 sub_format = sub_info['ext']
1531 if sub_info.get('data') is not None:
1532 sub_data = sub_info['data']
1533 else:
1534 try:
1535 sub_data = ie._download_webpage(
1536 sub_info['url'], info_dict['id'], note=False)
1537 except ExtractorError as err:
1538 self.report_warning('Unable to download subtitle for "%s": %s' %
1539 (sub_lang, error_to_compat_str(err.cause)))
1540 continue
1541 try:
1542 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1543 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1544 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1545 else:
1546 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1547 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1548 subfile.write(sub_data)
1549 except (OSError, IOError):
1550 self.report_error('Cannot write subtitles file ' + sub_filename)
1551 return
1552
1553 if self.params.get('writeinfojson', False):
1554 infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1555 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1556 self.to_screen('[info] Video description metadata is already present')
1557 else:
1558 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1559 try:
1560 write_json_file(self.filter_requested_info(info_dict), infofn)
1561 except (OSError, IOError):
1562 self.report_error('Cannot write metadata to JSON file ' + infofn)
1563 return
1564
1565 self._write_thumbnails(info_dict, filename)
1566
1567 if not self.params.get('skip_download', False):
1568 try:
1569 def dl(name, info):
1570 fd = get_suitable_downloader(info, self.params)(self, self.params)
1571 for ph in self._progress_hooks:
1572 fd.add_progress_hook(ph)
1573 if self.params.get('verbose'):
1574 self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1575 return fd.download(name, info)
1576
1577 if info_dict.get('requested_formats') is not None:
1578 downloaded = []
1579 success = True
1580 merger = FFmpegMergerPP(self)
1581 if not merger.available:
1582 postprocessors = []
1583 self.report_warning('You have requested multiple '
1584 'formats but ffmpeg or avconv are not installed.'
1585 ' The formats won\'t be merged.')
1586 else:
1587 postprocessors = [merger]
1588
1589 def compatible_formats(formats):
1590 video, audio = formats
1591 # Check extension
1592 video_ext, audio_ext = audio.get('ext'), video.get('ext')
1593 if video_ext and audio_ext:
1594 COMPATIBLE_EXTS = (
1595 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1596 ('webm')
1597 )
1598 for exts in COMPATIBLE_EXTS:
1599 if video_ext in exts and audio_ext in exts:
1600 return True
1601 # TODO: Check acodec/vcodec
1602 return False
1603
1604 filename_real_ext = os.path.splitext(filename)[1][1:]
1605 filename_wo_ext = (
1606 os.path.splitext(filename)[0]
1607 if filename_real_ext == info_dict['ext']
1608 else filename)
1609 requested_formats = info_dict['requested_formats']
1610 if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1611 info_dict['ext'] = 'mkv'
1612 self.report_warning(
1613 'Requested formats are incompatible for merge and will be merged into mkv.')
1614 # Ensure filename always has a correct extension for successful merge
1615 filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1616 if os.path.exists(encodeFilename(filename)):
1617 self.to_screen(
1618 '[download] %s has already been downloaded and '
1619 'merged' % filename)
1620 else:
1621 for f in requested_formats:
1622 new_info = dict(info_dict)
1623 new_info.update(f)
1624 fname = self.prepare_filename(new_info)
1625 fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1626 downloaded.append(fname)
1627 partial_success = dl(fname, new_info)
1628 success = success and partial_success
1629 info_dict['__postprocessors'] = postprocessors
1630 info_dict['__files_to_merge'] = downloaded
1631 else:
1632 # Just a single file
1633 success = dl(filename, info_dict)
1634 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1635 self.report_error('unable to download video data: %s' % str(err))
1636 return
1637 except (OSError, IOError) as err:
1638 raise UnavailableVideoError(err)
1639 except (ContentTooShortError, ) as err:
1640 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1641 return
1642
1643 if success and filename != '-':
1644 # Fixup content
1645 fixup_policy = self.params.get('fixup')
1646 if fixup_policy is None:
1647 fixup_policy = 'detect_or_warn'
1648
1649 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1650
1651 stretched_ratio = info_dict.get('stretched_ratio')
1652 if stretched_ratio is not None and stretched_ratio != 1:
1653 if fixup_policy == 'warn':
1654 self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1655 info_dict['id'], stretched_ratio))
1656 elif fixup_policy == 'detect_or_warn':
1657 stretched_pp = FFmpegFixupStretchedPP(self)
1658 if stretched_pp.available:
1659 info_dict.setdefault('__postprocessors', [])
1660 info_dict['__postprocessors'].append(stretched_pp)
1661 else:
1662 self.report_warning(
1663 '%s: Non-uniform pixel ratio (%s). %s'
1664 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1665 else:
1666 assert fixup_policy in ('ignore', 'never')
1667
1668 if (info_dict.get('requested_formats') is None and
1669 info_dict.get('container') == 'm4a_dash'):
1670 if fixup_policy == 'warn':
1671 self.report_warning(
1672 '%s: writing DASH m4a. '
1673 'Only some players support this container.'
1674 % info_dict['id'])
1675 elif fixup_policy == 'detect_or_warn':
1676 fixup_pp = FFmpegFixupM4aPP(self)
1677 if fixup_pp.available:
1678 info_dict.setdefault('__postprocessors', [])
1679 info_dict['__postprocessors'].append(fixup_pp)
1680 else:
1681 self.report_warning(
1682 '%s: writing DASH m4a. '
1683 'Only some players support this container. %s'
1684 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1685 else:
1686 assert fixup_policy in ('ignore', 'never')
1687
1688 if (info_dict.get('protocol') == 'm3u8_native' or
1689 info_dict.get('protocol') == 'm3u8' and
1690 self.params.get('hls_prefer_native')):
1691 if fixup_policy == 'warn':
1692 self.report_warning('%s: malformated aac bitstream.' % (
1693 info_dict['id']))
1694 elif fixup_policy == 'detect_or_warn':
1695 fixup_pp = FFmpegFixupM3u8PP(self)
1696 if fixup_pp.available:
1697 info_dict.setdefault('__postprocessors', [])
1698 info_dict['__postprocessors'].append(fixup_pp)
1699 else:
1700 self.report_warning(
1701 '%s: malformated aac bitstream. %s'
1702 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1703 else:
1704 assert fixup_policy in ('ignore', 'never')
1705
1706 try:
1707 self.post_process(filename, info_dict)
1708 except (PostProcessingError) as err:
1709 self.report_error('postprocessing: %s' % str(err))
1710 return
1711 self.record_download_archive(info_dict)
1712
1713 def download(self, url_list):
1714 """Download a given list of URLs."""
1715 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1716 if (len(url_list) > 1 and
1717 '%' not in outtmpl and
1718 self.params.get('max_downloads') != 1):
1719 raise SameFileError(outtmpl)
1720
1721 for url in url_list:
1722 try:
1723 # It also downloads the videos
1724 res = self.extract_info(
1725 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1726 except UnavailableVideoError:
1727 self.report_error('unable to download video')
1728 except MaxDownloadsReached:
1729 self.to_screen('[info] Maximum number of downloaded files reached.')
1730 raise
1731 else:
1732 if self.params.get('dump_single_json', False):
1733 self.to_stdout(json.dumps(res))
1734
1735 return self._download_retcode
1736
1737 def download_with_info_file(self, info_filename):
1738 with contextlib.closing(fileinput.FileInput(
1739 [info_filename], mode='r',
1740 openhook=fileinput.hook_encoded('utf-8'))) as f:
1741 # FileInput doesn't have a read method, we can't call json.load
1742 info = self.filter_requested_info(json.loads('\n'.join(f)))
1743 try:
1744 self.process_ie_result(info, download=True)
1745 except DownloadError:
1746 webpage_url = info.get('webpage_url')
1747 if webpage_url is not None:
1748 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1749 return self.download([webpage_url])
1750 else:
1751 raise
1752 return self._download_retcode
1753
1754 @staticmethod
1755 def filter_requested_info(info_dict):
1756 return dict(
1757 (k, v) for k, v in info_dict.items()
1758 if k not in ['requested_formats', 'requested_subtitles'])
1759
1760 def post_process(self, filename, ie_info):
1761 """Run all the postprocessors on the given file."""
1762 info = dict(ie_info)
1763 info['filepath'] = filename
1764 pps_chain = []
1765 if ie_info.get('__postprocessors') is not None:
1766 pps_chain.extend(ie_info['__postprocessors'])
1767 pps_chain.extend(self._pps)
1768 for pp in pps_chain:
1769 files_to_delete = []
1770 try:
1771 files_to_delete, info = pp.run(info)
1772 except PostProcessingError as e:
1773 self.report_error(e.msg)
1774 if files_to_delete and not self.params.get('keepvideo', False):
1775 for old_filename in files_to_delete:
1776 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1777 try:
1778 os.remove(encodeFilename(old_filename))
1779 except (IOError, OSError):
1780 self.report_warning('Unable to remove downloaded original file')
1781
1782 def _make_archive_id(self, info_dict):
1783 # Future-proof against any change in case
1784 # and backwards compatibility with prior versions
1785 extractor = info_dict.get('extractor_key')
1786 if extractor is None:
1787 if 'id' in info_dict:
1788 extractor = info_dict.get('ie_key') # key in a playlist
1789 if extractor is None:
1790 return None # Incomplete video information
1791 return extractor.lower() + ' ' + info_dict['id']
1792
1793 def in_download_archive(self, info_dict):
1794 fn = self.params.get('download_archive')
1795 if fn is None:
1796 return False
1797
1798 vid_id = self._make_archive_id(info_dict)
1799 if vid_id is None:
1800 return False # Incomplete video information
1801
1802 try:
1803 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1804 for line in archive_file:
1805 if line.strip() == vid_id:
1806 return True
1807 except IOError as ioe:
1808 if ioe.errno != errno.ENOENT:
1809 raise
1810 return False
1811
1812 def record_download_archive(self, info_dict):
1813 fn = self.params.get('download_archive')
1814 if fn is None:
1815 return
1816 vid_id = self._make_archive_id(info_dict)
1817 assert vid_id
1818 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1819 archive_file.write(vid_id + '\n')
1820
1821 @staticmethod
1822 def format_resolution(format, default='unknown'):
1823 if format.get('vcodec') == 'none':
1824 return 'audio only'
1825 if format.get('resolution') is not None:
1826 return format['resolution']
1827 if format.get('height') is not None:
1828 if format.get('width') is not None:
1829 res = '%sx%s' % (format['width'], format['height'])
1830 else:
1831 res = '%sp' % format['height']
1832 elif format.get('width') is not None:
1833 res = '%dx?' % format['width']
1834 else:
1835 res = default
1836 return res
1837
1838 def _format_note(self, fdict):
1839 res = ''
1840 if fdict.get('ext') in ['f4f', 'f4m']:
1841 res += '(unsupported) '
1842 if fdict.get('language'):
1843 if res:
1844 res += ' '
1845 res += '[%s] ' % fdict['language']
1846 if fdict.get('format_note') is not None:
1847 res += fdict['format_note'] + ' '
1848 if fdict.get('tbr') is not None:
1849 res += '%4dk ' % fdict['tbr']
1850 if fdict.get('container') is not None:
1851 if res:
1852 res += ', '
1853 res += '%s container' % fdict['container']
1854 if (fdict.get('vcodec') is not None and
1855 fdict.get('vcodec') != 'none'):
1856 if res:
1857 res += ', '
1858 res += fdict['vcodec']
1859 if fdict.get('vbr') is not None:
1860 res += '@'
1861 elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1862 res += 'video@'
1863 if fdict.get('vbr') is not None:
1864 res += '%4dk' % fdict['vbr']
1865 if fdict.get('fps') is not None:
1866 if res:
1867 res += ', '
1868 res += '%sfps' % fdict['fps']
1869 if fdict.get('acodec') is not None:
1870 if res:
1871 res += ', '
1872 if fdict['acodec'] == 'none':
1873 res += 'video only'
1874 else:
1875 res += '%-5s' % fdict['acodec']
1876 elif fdict.get('abr') is not None:
1877 if res:
1878 res += ', '
1879 res += 'audio'
1880 if fdict.get('abr') is not None:
1881 res += '@%3dk' % fdict['abr']
1882 if fdict.get('asr') is not None:
1883 res += ' (%5dHz)' % fdict['asr']
1884 if fdict.get('filesize') is not None:
1885 if res:
1886 res += ', '
1887 res += format_bytes(fdict['filesize'])
1888 elif fdict.get('filesize_approx') is not None:
1889 if res:
1890 res += ', '
1891 res += '~' + format_bytes(fdict['filesize_approx'])
1892 return res
1893
1894 def list_formats(self, info_dict):
1895 formats = info_dict.get('formats', [info_dict])
1896 table = [
1897 [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1898 for f in formats
1899 if f.get('preference') is None or f['preference'] >= -1000]
1900 if len(formats) > 1:
1901 table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1902
1903 header_line = ['format code', 'extension', 'resolution', 'note']
1904 self.to_screen(
1905 '[info] Available formats for %s:\n%s' %
1906 (info_dict['id'], render_table(header_line, table)))
1907
1908 def list_thumbnails(self, info_dict):
1909 thumbnails = info_dict.get('thumbnails')
1910 if not thumbnails:
1911 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1912 return
1913
1914 self.to_screen(
1915 '[info] Thumbnails for %s:' % info_dict['id'])
1916 self.to_screen(render_table(
1917 ['ID', 'width', 'height', 'URL'],
1918 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1919
1920 def list_subtitles(self, video_id, subtitles, name='subtitles'):
1921 if not subtitles:
1922 self.to_screen('%s has no %s' % (video_id, name))
1923 return
1924 self.to_screen(
1925 'Available %s for %s:' % (name, video_id))
1926 self.to_screen(render_table(
1927 ['Language', 'formats'],
1928 [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1929 for lang, formats in subtitles.items()]))
1930
1931 def urlopen(self, req):
1932 """ Start an HTTP download """
1933 if isinstance(req, compat_basestring):
1934 req = sanitized_Request(req)
1935 return self._opener.open(req, timeout=self._socket_timeout)
1936
    def print_debug_header(self):
        """Write debugging information (versions, encodings, proxy map) to the
        debug output. No-op unless the 'verbose' option is set."""
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        # Written directly (encoding=None) so the report appears even when the
        # configured output encoding itself is the problem being diagnosed
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        try:
            # When running from a git checkout, also report the exact commit
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            try:
                # Python 2 only: clear any partially-raised exception state
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Collect the effective proxy configuration from every opener handler
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # Opt-in network check: report public IP and warn on outdated version
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2001
2002 def _setup_opener(self):
2003 timeout_val = self.params.get('socket_timeout')
2004 self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2005
2006 opts_cookiefile = self.params.get('cookiefile')
2007 opts_proxy = self.params.get('proxy')
2008
2009 if opts_cookiefile is None:
2010 self.cookiejar = compat_cookiejar.CookieJar()
2011 else:
2012 self.cookiejar = compat_cookiejar.MozillaCookieJar(
2013 opts_cookiefile)
2014 if os.access(opts_cookiefile, os.R_OK):
2015 self.cookiejar.load()
2016
2017 cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2018 if opts_proxy is not None:
2019 if opts_proxy == '':
2020 proxies = {}
2021 else:
2022 proxies = {'http': opts_proxy, 'https': opts_proxy}
2023 else:
2024 proxies = compat_urllib_request.getproxies()
2025 # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2026 if 'http' in proxies and 'https' not in proxies:
2027 proxies['https'] = proxies['http']
2028 proxy_handler = PerRequestProxyHandler(proxies)
2029
2030 debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2031 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2032 ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2033 data_handler = compat_urllib_request_DataHandler()
2034
2035 # When passing our own FileHandler instance, build_opener won't add the
2036 # default FileHandler and allows us to disable the file protocol, which
2037 # can be used for malicious purposes (see
2038 # https://github.com/rg3/youtube-dl/issues/8227)
2039 file_handler = compat_urllib_request.FileHandler()
2040
2041 def file_open(*args, **kwargs):
2042 raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2043 file_handler.file_open = file_open
2044
2045 opener = compat_urllib_request.build_opener(
2046 proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2047
2048 # Delete the default user-agent header, which would otherwise apply in
2049 # cases where our custom HTTP handler doesn't come into play
2050 # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2051 opener.addheaders = []
2052 self._opener = opener
2053
2054 def encode(self, s):
2055 if isinstance(s, bytes):
2056 return s # Already encoded
2057
2058 try:
2059 return s.encode(self.get_encoding())
2060 except UnicodeEncodeError as err:
2061 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2062 raise
2063
2064 def get_encoding(self):
2065 encoding = self.params.get('encoding')
2066 if encoding is None:
2067 encoding = preferredencoding()
2068 return encoding
2069
    def _write_thumbnails(self, info_dict, filename):
        # Save thumbnail image(s) next to the media file.
        # 'writethumbnail' saves only the last thumbnail in the list;
        # 'write_all_thumbnails' saves every one.
        if self.params.get('writethumbnail', False):
            thumbnails = info_dict.get('thumbnails')
            if thumbnails:
                thumbnails = [thumbnails[-1]]
        elif self.params.get('write_all_thumbnails', False):
            thumbnails = info_dict.get('thumbnails')
        else:
            return

        if not thumbnails:
            # No thumbnails present, so return immediately
            return

        for t in thumbnails:
            thumb_ext = determine_ext(t['url'], 'jpg')
            # Disambiguate filenames and messages only when saving several thumbnails
            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
            # The chosen path is recorded back into the thumbnail dict
            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext

            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                self.to_screen('[%s] %s: Thumbnail %sis already present' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
            else:
                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
                try:
                    uf = self.urlopen(t['url'])
                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    # Best-effort: a failed thumbnail download must not abort the video download
                    self.report_warning('Unable to download thumbnail "%s": %s' %
                                        (t['url'], error_to_compat_str(err)))