]> jfr.im git - yt-dlp.git/blob - youtube_dl/YoutubeDL.py
Merge pull request #8497 from jaimeMF/lazy-load
[yt-dlp.git] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import datetime
9 import errno
10 import fileinput
11 import io
12 import itertools
13 import json
14 import locale
15 import operator
16 import os
17 import platform
18 import re
19 import shutil
20 import subprocess
21 import socket
22 import sys
23 import time
24 import tokenize
25 import traceback
26
27 from .compat import (
28 compat_basestring,
29 compat_cookiejar,
30 compat_expanduser,
31 compat_get_terminal_size,
32 compat_http_client,
33 compat_kwargs,
34 compat_os_name,
35 compat_str,
36 compat_tokenize_tokenize,
37 compat_urllib_error,
38 compat_urllib_request,
39 compat_urllib_request_DataHandler,
40 )
41 from .utils import (
42 age_restricted,
43 args_to_str,
44 ContentTooShortError,
45 date_from_str,
46 DateRange,
47 DEFAULT_OUTTMPL,
48 determine_ext,
49 determine_protocol,
50 DownloadError,
51 encode_compat_str,
52 encodeFilename,
53 error_to_compat_str,
54 ExtractorError,
55 format_bytes,
56 formatSeconds,
57 locked_file,
58 make_HTTPS_handler,
59 MaxDownloadsReached,
60 PagedList,
61 parse_filesize,
62 PerRequestProxyHandler,
63 platform_name,
64 PostProcessingError,
65 preferredencoding,
66 prepend_extension,
67 render_table,
68 replace_extension,
69 SameFileError,
70 sanitize_filename,
71 sanitize_path,
72 sanitize_url,
73 sanitized_Request,
74 std_headers,
75 subtitles_filename,
76 UnavailableVideoError,
77 url_basename,
78 version_tuple,
79 write_json_file,
80 write_string,
81 YoutubeDLCookieProcessor,
82 YoutubeDLHandler,
83 )
84 from .cache import Cache
85 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
86 from .downloader import get_suitable_downloader
87 from .downloader.rtmp import rtmpdump_version
88 from .postprocessor import (
89 FFmpegFixupM3u8PP,
90 FFmpegFixupM4aPP,
91 FFmpegFixupStretchedPP,
92 FFmpegMergerPP,
93 FFmpegPostProcessor,
94 get_postprocessor,
95 )
96 from .version import __version__
97
98 if compat_os_name == 'nt':
99 import ctypes
100
101
102 class YoutubeDL(object):
103 """YoutubeDL class.
104
105 YoutubeDL objects are the ones responsible of downloading the
106 actual video file and writing it to disk if the user has requested
107 it, among some other tasks. In most cases there should be one per
108 program. As, given a video URL, the downloader doesn't know how to
109 extract all the needed information, task that InfoExtractors do, it
110 has to pass the URL to one of them.
111
112 For this, YoutubeDL objects have a method that allows
113 InfoExtractors to be registered in a given order. When it is passed
114 a URL, the YoutubeDL object handles it to the first InfoExtractor it
115 finds that reports being able to handle it. The InfoExtractor extracts
116 all the information about the video or videos the URL refers to, and
117 YoutubeDL process the extracted information, possibly using a File
118 Downloader to download the video.
119
120 YoutubeDL objects accept a lot of parameters. In order not to saturate
121 the object constructor with arguments, it receives a dictionary of
122 options instead. These options are available through the params
123 attribute for the InfoExtractors to use. The YoutubeDL also
124 registers itself as the downloader in charge for the InfoExtractors
125 that are added to it, so this is a "mutual registration".
126
127 Available options:
128
129 username: Username for authentication purposes.
130 password: Password for authentication purposes.
131 videopassword: Password for accessing a video.
132 usenetrc: Use netrc for authentication instead.
133 verbose: Print additional info to stdout.
134 quiet: Do not print messages to stdout.
135 no_warnings: Do not print out anything for warnings.
136 forceurl: Force printing final URL.
137 forcetitle: Force printing title.
138 forceid: Force printing ID.
139 forcethumbnail: Force printing thumbnail URL.
140 forcedescription: Force printing description.
141 forcefilename: Force printing final filename.
142 forceduration: Force printing duration.
143 forcejson: Force printing info_dict as JSON.
144 dump_single_json: Force printing the info_dict of the whole playlist
145 (or video) as a single JSON line.
146 simulate: Do not download the video files.
147 format: Video format code. See options.py for more information.
148 outtmpl: Template for output names.
149 restrictfilenames: Do not allow "&" and spaces in file names
150 ignoreerrors: Do not stop on download errors.
151 force_generic_extractor: Force downloader to use the generic extractor
152 nooverwrites: Prevent overwriting files.
153 playliststart: Playlist item to start at.
154 playlistend: Playlist item to end at.
155 playlist_items: Specific indices of playlist to download.
156 playlistreverse: Download playlist items in reverse order.
157 matchtitle: Download only matching titles.
158 rejecttitle: Reject downloads for matching titles.
159 logger: Log messages to a logging.Logger instance.
160 logtostderr: Log messages to stderr instead of stdout.
161 writedescription: Write the video description to a .description file
162 writeinfojson: Write the video description to a .info.json file
163 writeannotations: Write the video annotations to a .annotations.xml file
164 writethumbnail: Write the thumbnail image to a file
165 write_all_thumbnails: Write all thumbnail formats to files
166 writesubtitles: Write the video subtitles to a file
167 writeautomaticsub: Write the automatically generated subtitles to a file
168 allsubtitles: Downloads all the subtitles of the video
169 (requires writesubtitles or writeautomaticsub)
170 listsubtitles: Lists all available subtitles for the video
171 subtitlesformat: The format code for subtitles
172 subtitleslangs: List of languages of the subtitles to download
173 keepvideo: Keep the video file after post-processing
174 daterange: A DateRange object, download only if the upload_date is in the range.
175 skip_download: Skip the actual download of the video file
176 cachedir: Location of the cache files in the filesystem.
177 False to disable filesystem cache.
178 noplaylist: Download single video instead of a playlist if in doubt.
179 age_limit: An integer representing the user's age in years.
180 Unsuitable videos for the given age are skipped.
181 min_views: An integer representing the minimum view count the video
182 must have in order to not be skipped.
183 Videos without view count information are always
184 downloaded. None for no limit.
185 max_views: An integer representing the maximum view count.
186 Videos that are more popular than that are not
187 downloaded.
188 Videos without view count information are always
189 downloaded. None for no limit.
190 download_archive: File name of a file where all downloads are recorded.
191 Videos already present in the file are not downloaded
192 again.
193 cookiefile: File name where cookies should be read from and dumped to.
194 nocheckcertificate:Do not verify SSL certificates
195 prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
196 At the moment, this is only supported by YouTube.
197 proxy: URL of the proxy server to use
198 cn_verification_proxy: URL of the proxy to use for IP address verification
199 on Chinese sites. (Experimental)
200 socket_timeout: Time to wait for unresponsive hosts, in seconds
201 bidi_workaround: Work around buggy terminals without bidirectional text
202 support, using fridibi
203 debug_printtraffic:Print out sent and received HTTP traffic
204 include_ads: Download ads as well
205 default_search: Prepend this string if an input url is not valid.
206 'auto' for elaborate guessing
207 encoding: Use this encoding instead of the system-specified.
208 extract_flat: Do not resolve URLs, return the immediate result.
209 Pass in 'in_playlist' to only show this behavior for
210 playlist items.
211 postprocessors: A list of dictionaries, each with an entry
212 * key: The name of the postprocessor. See
213 youtube_dl/postprocessor/__init__.py for a list.
214 as well as any further keyword arguments for the
215 postprocessor.
216 progress_hooks: A list of functions that get called on download
217 progress, with a dictionary with the entries
218 * status: One of "downloading", "error", or "finished".
219 Check this first and ignore unknown values.
220
221 If status is one of "downloading", or "finished", the
222 following properties may also be present:
223 * filename: The final filename (always present)
224 * tmpfilename: The filename we're currently writing to
225 * downloaded_bytes: Bytes on disk
226 * total_bytes: Size of the whole file, None if unknown
227 * total_bytes_estimate: Guess of the eventual file size,
228 None if unavailable.
229 * elapsed: The number of seconds since download started.
230 * eta: The estimated time in seconds, None if unknown
231 * speed: The download speed in bytes/second, None if
232 unknown
233 * fragment_index: The counter of the currently
234 downloaded video fragment.
235 * fragment_count: The number of fragments (= individual
236 files that will be merged)
237
238 Progress hooks are guaranteed to be called at least once
239 (with status "finished") if the download is successful.
240 merge_output_format: Extension to use when merging formats.
241 fixup: Automatically correct known faults of the file.
242 One of:
243 - "never": do nothing
244 - "warn": only emit a warning
245 - "detect_or_warn": check whether we can do anything
246 about it, warn otherwise (default)
247 source_address: (Experimental) Client-side IP address to bind to.
248 call_home: Boolean, true iff we are allowed to contact the
249 youtube-dl servers for debugging.
250 sleep_interval: Number of seconds to sleep before each download.
251 listformats: Print an overview of available video formats and exit.
252 list_thumbnails: Print a table of all thumbnails and exit.
253 match_filter: A function that gets called with the info_dict of
254 every video.
255 If it returns a message, the video is ignored.
256 If it returns None, the video is downloaded.
257 match_filter_func in utils.py is one example for this.
258 no_color: Do not emit color codes in output.
259
260 The following options determine which downloader is picked:
261 external_downloader: Executable of the external downloader to call.
262 None or unset for standard (built-in) downloader.
263 hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.
264
265 The following parameters are not used by YoutubeDL itself, they are used by
266 the downloader (see youtube_dl/downloader/common.py):
267 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
268 noresizebuffer, retries, continuedl, noprogress, consoletitle,
269 xattr_set_filesize, external_downloader_args, hls_use_mpegts.
270
271 The following options are used by the post processors:
272 prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
273 otherwise prefer avconv.
274 postprocessor_args: A list of additional command-line arguments for the
275 postprocessor.
276 """
277
278 params = None
279 _ies = []
280 _pps = []
281 _download_retcode = None
282 _num_downloads = None
283 _screen_file = None
284
    def __init__(self, params=None, auto_init=True):
        """Create a FileDownloader object with the given options.

        params: dict of options (see the class docstring for the full list);
            they are merged over the built-in defaults into self.params.
        auto_init: when True, print the debug header and register all
            default info extractors immediately.
        """
        if params is None:
            params = {}
        self._ies = []
        self._ies_instances = {}
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Boolean index: stderr when logtostderr is set, stdout otherwise.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self._err_file = sys.stderr
        self.params = {
            # Default parameters
            'nocheckcertificate': False,
        }
        self.params.update(params)
        self.cache = Cache(self)

        if params.get('bidi_workaround', False):
            try:
                import pty
                # A pty pair is used so the bidi filter's output can be read back.
                master, slave = pty.openpty()
                width = compat_get_terminal_size().columns
                if width is None:
                    width_args = []
                else:
                    width_args = ['-w', str(width)]
                sp_kwargs = dict(
                    stdin=subprocess.PIPE,
                    stdout=slave,
                    stderr=self._err_file)
                try:
                    self._output_process = subprocess.Popen(
                        ['bidiv'] + width_args, **sp_kwargs
                    )
                except OSError:
                    # bidiv is not available; fall back to fribidi.
                    self._output_process = subprocess.Popen(
                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                self._output_channel = os.fdopen(master, 'rb')
            except OSError as ose:
                if ose.errno == 2:
                    # errno 2 == ENOENT: neither executable was found.
                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
                else:
                    raise

        if (sys.version_info >= (3,) and sys.platform != 'win32' and
                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                not params.get('restrictfilenames', False)):
            # On Python 3, the Unicode filesystem API will throw errors (#1474)
            self.report_warning(
                'Assuming --restrict-filenames since file system encoding '
                'cannot encode all characters. '
                'Set the LC_ALL environment variable to fix this.')
            self.params['restrictfilenames'] = True

        if isinstance(params.get('outtmpl'), bytes):
            self.report_warning(
                'Parameter outtmpl is bytes, but should be a unicode string. '
                'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')

        self._setup_opener()

        if auto_init:
            self.print_debug_header()
            self.add_default_info_extractors()

        # Instantiate and register the configured postprocessors; 'key' names
        # the class, the remaining entries are passed as keyword arguments.
        for pp_def_raw in self.params.get('postprocessors', []):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
            pp = pp_class(self, **compat_kwargs(pp_def))
            self.add_post_processor(pp)

        for ph in self.params.get('progress_hooks', []):
            self.add_progress_hook(ph)
361
362 def warn_if_short_id(self, argv):
363 # short YouTube ID starting with dash?
364 idxs = [
365 i for i, a in enumerate(argv)
366 if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
367 if idxs:
368 correct_argv = (
369 ['youtube-dl'] +
370 [a for i, a in enumerate(argv) if i not in idxs] +
371 ['--'] + [argv[i] for i in idxs]
372 )
373 self.report_warning(
374 'Long argument string detected. '
375 'Use -- to separate parameters and URLs, like this:\n%s\n' %
376 args_to_str(correct_argv))
377
378 def add_info_extractor(self, ie):
379 """Add an InfoExtractor object to the end of the list."""
380 self._ies.append(ie)
381 if not isinstance(ie, type):
382 self._ies_instances[ie.ie_key()] = ie
383 ie.set_downloader(self)
384
385 def get_info_extractor(self, ie_key):
386 """
387 Get an instance of an IE with name ie_key, it will try to get one from
388 the _ies list, if there's no instance it will create a new one and add
389 it to the extractor list.
390 """
391 ie = self._ies_instances.get(ie_key)
392 if ie is None:
393 ie = get_info_extractor(ie_key)()
394 self.add_info_extractor(ie)
395 return ie
396
397 def add_default_info_extractors(self):
398 """
399 Add the InfoExtractors returned by gen_extractors to the end of the list
400 """
401 for ie in gen_extractor_classes():
402 self.add_info_extractor(ie)
403
404 def add_post_processor(self, pp):
405 """Add a PostProcessor object to the end of the chain."""
406 self._pps.append(pp)
407 pp.set_downloader(self)
408
409 def add_progress_hook(self, ph):
410 """Add the progress hook (currently only for the file downloader)"""
411 self._progress_hooks.append(ph)
412
    def _bidi_workaround(self, message):
        # If no bidi subprocess was set up in __init__ (option off, or the
        # executables were missing), pass the message through unchanged.
        if not hasattr(self, '_output_channel'):
            return message

        assert hasattr(self, '_output_process')
        assert isinstance(message, compat_str)
        line_count = message.count('\n') + 1
        # Feed the text (with a trailing newline) to bidiv/fribidi...
        self._output_process.stdin.write((message + '\n').encode('utf-8'))
        self._output_process.stdin.flush()
        # ...then read back exactly as many reordered lines as were written.
        res = ''.join(self._output_channel.readline().decode('utf-8')
                      for _ in range(line_count))
        # Strip the newline that was appended above.
        return res[:-len('\n')]
425
    def to_screen(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        # Thin wrapper around to_stdout with quiet-checking enabled.
        return self.to_stdout(message, skip_eol, check_quiet=True)
429
430 def _write_string(self, s, out=None):
431 write_string(s, out=out, encoding=self.params.get('encoding'))
432
433 def to_stdout(self, message, skip_eol=False, check_quiet=False):
434 """Print message to stdout if not in quiet mode."""
435 if self.params.get('logger'):
436 self.params['logger'].debug(message)
437 elif not check_quiet or not self.params.get('quiet', False):
438 message = self._bidi_workaround(message)
439 terminator = ['\n', ''][skip_eol]
440 output = message + terminator
441
442 self._write_string(output, self._screen_file)
443
444 def to_stderr(self, message):
445 """Print message to stderr."""
446 assert isinstance(message, compat_str)
447 if self.params.get('logger'):
448 self.params['logger'].error(message)
449 else:
450 message = self._bidi_workaround(message)
451 output = message + '\n'
452 self._write_string(output, self._err_file)
453
    def to_console_title(self, message):
        # No-op unless the 'consoletitle' option is enabled.
        if not self.params.get('consoletitle', False):
            return
        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style OSC 0 escape: set icon name and window title.
            self._write_string('\033]0;%s\007' % message, self._screen_file)
463
464 def save_console_title(self):
465 if not self.params.get('consoletitle', False):
466 return
467 if 'TERM' in os.environ:
468 # Save the title on stack
469 self._write_string('\033[22;0t', self._screen_file)
470
471 def restore_console_title(self):
472 if not self.params.get('consoletitle', False):
473 return
474 if 'TERM' in os.environ:
475 # Restore the title from stack
476 self._write_string('\033[23;0t', self._screen_file)
477
478 def __enter__(self):
479 self.save_console_title()
480 return self
481
482 def __exit__(self, *args):
483 self.restore_console_title()
484
485 if self.params.get('cookiefile') is not None:
486 self.cookiejar.save()
487
    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.
        """
        if message is not None:
            self.to_stderr(message)
        if self.params.get('verbose'):
            if tb is None:
                if sys.exc_info()[0]:  # if .trouble has been called from an except block
                    tb = ''
                    # Wrapped extractor errors carry the original exc_info;
                    # include that traceback before the current one.
                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
                    tb += encode_compat_str(traceback.format_exc())
                else:
                    # Not inside an except block: show the current call stack.
                    tb_data = traceback.format_list(traceback.extract_stack())
                    tb = ''.join(tb_data)
            self.to_stderr(tb)
        if not self.params.get('ignoreerrors', False):
            # Re-raise as DownloadError, preferring the wrapped original
            # exc_info when the active exception provides one.
            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
                exc_info = sys.exc_info()[1].exc_info
            else:
                exc_info = sys.exc_info()
            raise DownloadError(message, exc_info)
        self._download_retcode = 1
517
518 def report_warning(self, message):
519 '''
520 Print the message to stderr, it will be prefixed with 'WARNING:'
521 If stderr is a tty file the 'WARNING:' will be colored
522 '''
523 if self.params.get('logger') is not None:
524 self.params['logger'].warning(message)
525 else:
526 if self.params.get('no_warnings'):
527 return
528 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
529 _msg_header = '\033[0;33mWARNING:\033[0m'
530 else:
531 _msg_header = 'WARNING:'
532 warning_message = '%s %s' % (_msg_header, message)
533 self.to_stderr(warning_message)
534
535 def report_error(self, message, tb=None):
536 '''
537 Do the same as trouble, but prefixes the message with 'ERROR:', colored
538 in red if stderr is a tty file.
539 '''
540 if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
541 _msg_header = '\033[0;31mERROR:\033[0m'
542 else:
543 _msg_header = 'ERROR:'
544 error_message = '%s %s' % (_msg_header, message)
545 self.trouble(error_message, tb)
546
547 def report_file_already_downloaded(self, file_name):
548 """Report file has already been fully downloaded."""
549 try:
550 self.to_screen('[download] %s has already been downloaded' % file_name)
551 except UnicodeEncodeError:
552 self.to_screen('[download] The file has already been downloaded')
553
    def prepare_filename(self, info_dict):
        """Generate the output filename from the outtmpl template.

        Returns the sanitized path, or None on a template error (after
        reporting it).
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            autonumber_templ = '%0' + str(autonumber_size) + 'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            # Zero-pad playlist_index to the width of the playlist length.
            if template_dict.get('playlist_index') is not None:
                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
            # Derive a 'resolution' field when the extractor did not set one.
            if template_dict.get('resolution') is None:
                if template_dict.get('width') and template_dict.get('height'):
                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
                elif template_dict.get('height'):
                    template_dict['resolution'] = '%sp' % template_dict['height']
                elif template_dict.get('width'):
                    template_dict['resolution'] = '%dx?' % template_dict['width']

            # Sanitize every value for use in a filename; 'id' fields get the
            # less aggressive is_id treatment.
            sanitize = lambda k, v: sanitize_filename(
                compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == 'id'))
            template_dict = dict((k, sanitize(k, v))
                                 for k, v in template_dict.items()
                                 if v is not None)
            # Missing template fields render as 'NA' instead of raising.
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)

            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
            # 'Treat' all problem characters by passing filename through preferredencoding
            # to workaround encoding issues with subprocess on python2 @ Windows
            if sys.version_info < (3, 0) and sys.platform == 'win32':
                filename = encodeFilename(filename, True).decode(preferredencoding())
            return sanitize_path(filename)
        except ValueError as err:
            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
            return None
596
    def _match_entry(self, info_dict, incomplete):
        """ Returns None iff the file should be downloaded """
        # Otherwise returns a human-readable reason string for skipping.
        # Filters are applied in order: title match/reject, date range,
        # view counts, age limit, download archive, then the user's
        # match_filter callback (skipped for incomplete/playlist entries).

        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        if 'title' in info_dict:
            # This can happen when we're just evaluating the playlist
            title = info_dict['title']
            matchtitle = self.params.get('matchtitle', False)
            if matchtitle:
                if not re.search(matchtitle, title, re.IGNORECASE):
                    return '"' + title + '" title did not match pattern "' + matchtitle + '"'
            rejecttitle = self.params.get('rejecttitle', False)
            if rejecttitle:
                if re.search(rejecttitle, title, re.IGNORECASE):
                    return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        date = info_dict.get('upload_date')
        if date is not None:
            # Default DateRange() accepts any date.
            dateRange = self.params.get('daterange', DateRange())
            if date not in dateRange:
                return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
        view_count = info_dict.get('view_count')
        if view_count is not None:
            # Videos without view_count information skip both bounds checks.
            min_views = self.params.get('min_views')
            if min_views is not None and view_count < min_views:
                return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
            max_views = self.params.get('max_views')
            if max_views is not None and view_count > max_views:
                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
            return 'Skipping "%s" because it is age restricted' % video_title
        if self.in_download_archive(info_dict):
            return '%s has already been recorded in archive' % video_title

        if not incomplete:
            # Only run the user callback on fully-extracted info dicts.
            match_filter = self.params.get('match_filter')
            if match_filter is not None:
                ret = match_filter(info_dict)
                if ret is not None:
                    return ret

        return None
638
639 @staticmethod
640 def add_extra_info(info_dict, extra_info):
641 '''Set the keys from extra_info in info dict if they are missing'''
642 for key, value in extra_info.items():
643 info_dict.setdefault(key, value)
644
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        '''
        # ie_key pins a specific extractor; process=False returns the raw
        # ie_result without resolving nested references.

        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'

        if ie_key:
            ies = [self.get_info_extractor(ie_key)]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            # _ies may hold classes; fetch (or lazily create) the instance.
            ie = self.get_info_extractor(ie.ie_key())
            if not ie.working():
                self.report_warning('The program functionality for this site has been marked as broken, '
                                    'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                self.add_default_extra_info(ie_result, ie, url)
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as e:  # An error we somewhat expected
                self.report_error(compat_str(e), e.format_traceback())
                break
            except MaxDownloadsReached:
                raise
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # for/else: no extractor claimed the URL.
            self.report_error('no suitable InfoExtractor for URL %s' % url)
698
699 def add_default_extra_info(self, ie_result, ie, url):
700 self.add_extra_info(ie_result, {
701 'extractor': ie.IE_NAME,
702 'webpage_url': url,
703 'webpage_url_basename': url_basename(url),
704 'extractor_key': ie.ie_key(),
705 })
706
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            extract_flat = self.params.get('extract_flat', False)
            # With flat extraction, url results (at least those nested in a
            # playlist) are returned as-is instead of being resolved.
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # Non-None values from the transparent result override the
            # embedded page's, except for the routing keys removed below.
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            assert new_result.get('_type') != 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist' or result_type == 'multi_video':
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in params; convert to 0-based slice index.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                def iter_playlistitems(format):
                    # Expand a "1-3,7" style spec into individual 1-based indices.
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = iter_playlistitems(playlistitems_str)

            # Entries may be a concrete list, a PagedList, or a plain iterable;
            # each source is sliced/selected differently.
            ie_entries = ie_result['entries']
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = [
                        ie_entries[i - 1] for i in playlistitems
                        if -n_all_entries <= i - 1 < n_all_entries]
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))
            else:  # iterable
                if playlistitems:
                    entry_list = list(ie_entries)
                    entries = [entry_list[i - 1] for i in playlistitems]
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, n_entries))

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # incomplete=True: entries may lack full metadata at this point.
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            def _fixup(r):
                # Backfill bookkeeping fields on each legacy entry in place.
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
870
871 def _build_format_filter(self, filter_spec):
872 " Returns a function to filter the formats according to the filter_spec "
873
874 OPERATORS = {
875 '<': operator.lt,
876 '<=': operator.le,
877 '>': operator.gt,
878 '>=': operator.ge,
879 '=': operator.eq,
880 '!=': operator.ne,
881 }
882 operator_rex = re.compile(r'''(?x)\s*
883 (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
884 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
885 (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
886 $
887 ''' % '|'.join(map(re.escape, OPERATORS.keys())))
888 m = operator_rex.search(filter_spec)
889 if m:
890 try:
891 comparison_value = int(m.group('value'))
892 except ValueError:
893 comparison_value = parse_filesize(m.group('value'))
894 if comparison_value is None:
895 comparison_value = parse_filesize(m.group('value') + 'B')
896 if comparison_value is None:
897 raise ValueError(
898 'Invalid value %r in format specification %r' % (
899 m.group('value'), filter_spec))
900 op = OPERATORS[m.group('op')]
901
902 if not m:
903 STR_OPERATORS = {
904 '=': operator.eq,
905 '!=': operator.ne,
906 '^=': lambda attr, value: attr.startswith(value),
907 '$=': lambda attr, value: attr.endswith(value),
908 '*=': lambda attr, value: value in attr,
909 }
910 str_operator_rex = re.compile(r'''(?x)
911 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
912 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
913 \s*(?P<value>[a-zA-Z0-9._-]+)
914 \s*$
915 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
916 m = str_operator_rex.search(filter_spec)
917 if m:
918 comparison_value = m.group('value')
919 op = STR_OPERATORS[m.group('op')]
920
921 if not m:
922 raise ValueError('Invalid filter specification %r' % filter_spec)
923
924 def _filter(f):
925 actual_value = f.get(m.group('key'))
926 if actual_value is None:
927 return m.group('none_inclusive')
928 return op(actual_value, comparison_value)
929 return _filter
930
    def build_format_selector(self, format_spec):
        """Compile *format_spec* (e.g. 'bestvideo+bestaudio/best' or
        'mp4[height<=480]') into a function that, given an iterable of
        format dicts, yields the selected format dict(s).

        The spec is tokenized with Python's own tokenizer, cleaned up by
        _remove_unused_ops, parsed into a tree of FormatSelector nodes
        (SINGLE, GROUP, PICKFIRST, MERGE) and finally compiled into
        nested selector functions by _build_selector_function.
        """
        def syntax_error(note, start):
            # Build (not raise) a SyntaxError pointing a caret at the
            # offending column of the original spec.
            message = (
                'Invalid format specification: '
                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
            return SyntaxError(message)

        # Node kinds of the parse tree.
        PICKFIRST = 'PICKFIRST'
        MERGE = 'MERGE'
        SINGLE = 'SINGLE'
        GROUP = 'GROUP'
        # type: one of the four kinds above; selector: payload (string,
        # child node(s) or list); filters: filter spec strings from [...].
        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

        def _parse_filter(tokens):
            # Consume tokens up to the closing ']' and return the raw
            # filter string in between (parsed later by _build_format_filter).
            filter_parts = []
            for type, string, start, _, _ in tokens:
                if type == tokenize.OP and string == ']':
                    return ''.join(filter_parts)
                else:
                    filter_parts.append(string)

        def _remove_unused_ops(tokens):
            # Remove operators that we don't use and join them with the surrounding strings
            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
            ALLOWED_OPS = ('/', '+', ',', '(', ')')
            last_string, last_start, last_end, last_line = None, None, None, None
            for type, string, start, end, line in tokens:
                if type == tokenize.OP and string == '[':
                    # Flush any pending joined name before the bracket.
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                    # everything inside brackets will be handled by _parse_filter
                    for type, string, start, end, line in tokens:
                        yield type, string, start, end, line
                        if type == tokenize.OP and string == ']':
                            break
                elif type == tokenize.OP and string in ALLOWED_OPS:
                    if last_string:
                        yield tokenize.NAME, last_string, last_start, last_end, last_line
                        last_string = None
                    yield type, string, start, end, line
                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                    # Glue adjacent names/numbers/other ops into one NAME token.
                    if not last_string:
                        last_string = string
                        last_start = start
                        last_end = end
                    else:
                        last_string += string
            if last_string:
                yield tokenize.NAME, last_string, last_start, last_end, last_line

        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
            # Recursive-descent parser over the token stream; the inside_*
            # flags tell it which closing tokens end the current context.
            selectors = []
            current_selector = None
            for type, string, start, _, _ in tokens:
                # ENCODING is only defined in python 3.x
                if type == getattr(tokenize, 'ENCODING', None):
                    continue
                elif type in [tokenize.NAME, tokenize.NUMBER]:
                    current_selector = FormatSelector(SINGLE, string, [])
                elif type == tokenize.OP:
                    if string == ')':
                        if not inside_group:
                            # ')' will be handled by the parentheses group
                            tokens.restore_last_token()
                        break
                    elif inside_merge and string in ['/', ',']:
                        tokens.restore_last_token()
                        break
                    elif inside_choice and string == ',':
                        tokens.restore_last_token()
                        break
                    elif string == ',':
                        if not current_selector:
                            raise syntax_error('"," must follow a format selector', start)
                        selectors.append(current_selector)
                        current_selector = None
                    elif string == '/':
                        if not current_selector:
                            raise syntax_error('"/" must follow a format selector', start)
                        first_choice = current_selector
                        second_choice = _parse_format_selection(tokens, inside_choice=True)
                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                    elif string == '[':
                        # A bare filter ('[height<=480]') implicitly filters 'best'.
                        if not current_selector:
                            current_selector = FormatSelector(SINGLE, 'best', [])
                        format_filter = _parse_filter(tokens)
                        current_selector.filters.append(format_filter)
                    elif string == '(':
                        if current_selector:
                            raise syntax_error('Unexpected "("', start)
                        group = _parse_format_selection(tokens, inside_group=True)
                        current_selector = FormatSelector(GROUP, group, [])
                    elif string == '+':
                        video_selector = current_selector
                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
                        if not video_selector or not audio_selector:
                            raise syntax_error('"+" must be between two format selectors', start)
                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                    else:
                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                elif type == tokenize.ENDMARKER:
                    break
            if current_selector:
                selectors.append(current_selector)
            return selectors

        def _build_selector_function(selector):
            # Compile a parse-tree node (or list of top-level nodes) into a
            # function mapping a format list to the selected format(s).
            if isinstance(selector, list):
                fs = [_build_selector_function(s) for s in selector]

                def selector_function(formats):
                    # Concatenate the output of every top-level selector.
                    for f in fs:
                        for format in f(formats):
                            yield format
                return selector_function
            elif selector.type == GROUP:
                selector_function = _build_selector_function(selector.selector)
            elif selector.type == PICKFIRST:
                fs = [_build_selector_function(s) for s in selector.selector]

                def selector_function(formats):
                    # First alternative that yields anything wins.
                    for f in fs:
                        picked_formats = list(f(formats))
                        if picked_formats:
                            return picked_formats
                    return []
            elif selector.type == SINGLE:
                format_spec = selector.selector

                def selector_function(formats):
                    formats = list(formats)
                    if not formats:
                        return
                    if format_spec == 'all':
                        for f in formats:
                            yield f
                    elif format_spec in ['best', 'worst', None]:
                        # Formats are assumed sorted worst-to-best, so -1 is best.
                        format_idx = 0 if format_spec == 'worst' else -1
                        audiovideo_formats = [
                            f for f in formats
                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                        if audiovideo_formats:
                            yield audiovideo_formats[format_idx]
                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
                        elif (all(f.get('acodec') != 'none' for f in formats) or
                                all(f.get('vcodec') != 'none' for f in formats)):
                            yield formats[format_idx]
                    elif format_spec == 'bestaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[-1]
                    elif format_spec == 'worstaudio':
                        audio_formats = [
                            f for f in formats
                            if f.get('vcodec') == 'none']
                        if audio_formats:
                            yield audio_formats[0]
                    elif format_spec == 'bestvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[-1]
                    elif format_spec == 'worstvideo':
                        video_formats = [
                            f for f in formats
                            if f.get('acodec') == 'none']
                        if video_formats:
                            yield video_formats[0]
                    else:
                        # A known extension selects by ext, anything else by format_id.
                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                        if format_spec in extensions:
                            filter_f = lambda f: f['ext'] == format_spec
                        else:
                            filter_f = lambda f: f['format_id'] == format_spec
                        matches = list(filter(filter_f, formats))
                        if matches:
                            yield matches[-1]
            elif selector.type == MERGE:
                def _merge(formats_info):
                    format_1, format_2 = [f['format_id'] for f in formats_info]
                    # The first format must contain the video and the
                    # second the audio
                    if formats_info[0].get('vcodec') == 'none':
                        self.report_error('The first format must '
                                          'contain the video, try using '
                                          '"-f %s+%s"' % (format_2, format_1))
                        return
                    # Formats must be opposite (video+audio)
                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                        self.report_error(
                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                            % (format_1, format_2))
                        return
                    output_ext = (
                        formats_info[0]['ext']
                        if self.params.get('merge_output_format') is None
                        else self.params['merge_output_format'])
                    # Synthesized merged format: video attributes from the
                    # first component, audio attributes from the second.
                    return {
                        'requested_formats': formats_info,
                        'format': '%s+%s' % (formats_info[0].get('format'),
                                             formats_info[1].get('format')),
                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                                formats_info[1].get('format_id')),
                        'width': formats_info[0].get('width'),
                        'height': formats_info[0].get('height'),
                        'resolution': formats_info[0].get('resolution'),
                        'fps': formats_info[0].get('fps'),
                        'vcodec': formats_info[0].get('vcodec'),
                        'vbr': formats_info[0].get('vbr'),
                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
                        'acodec': formats_info[1].get('acodec'),
                        'abr': formats_info[1].get('abr'),
                        'ext': output_ext,
                    }
                video_selector, audio_selector = map(_build_selector_function, selector.selector)

                def selector_function(formats):
                    formats = list(formats)
                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
                        yield _merge(pair)

            # Apply the node's [...] filters before running its selector.
            filters = [self._build_format_filter(f) for f in selector.filters]

            def final_selector(formats):
                for _filter in filters:
                    formats = list(filter(_filter, formats))
                return selector_function(formats)
            return final_selector

        stream = io.BytesIO(format_spec.encode('utf-8'))
        try:
            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

        class TokenIterator(object):
            # Minimal iterator over the token list supporting the one-token
            # pushback (restore_last_token) the parser needs.
            def __init__(self, tokens):
                self.tokens = tokens
                self.counter = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.counter >= len(self.tokens):
                    raise StopIteration()
                value = self.tokens[self.counter]
                self.counter += 1
                return value

            next = __next__  # Python 2 iterator protocol

            def restore_last_token(self):
                self.counter -= 1

        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)
1193
1194 def _calc_headers(self, info_dict):
1195 res = std_headers.copy()
1196
1197 add_headers = info_dict.get('http_headers')
1198 if add_headers:
1199 res.update(add_headers)
1200
1201 cookies = self._calc_cookies(info_dict)
1202 if cookies:
1203 res['Cookie'] = cookies
1204
1205 return res
1206
1207 def _calc_cookies(self, info_dict):
1208 pr = sanitized_Request(info_dict['url'])
1209 self.cookiejar.add_cookie_header(pr)
1210 return pr.get_header('Cookie')
1211
1212 def process_video_result(self, info_dict, download=True):
1213 assert info_dict.get('_type', 'video') == 'video'
1214
1215 if 'id' not in info_dict:
1216 raise ExtractorError('Missing "id" field in extractor result')
1217 if 'title' not in info_dict:
1218 raise ExtractorError('Missing "title" field in extractor result')
1219
1220 if 'playlist' not in info_dict:
1221 # It isn't part of a playlist
1222 info_dict['playlist'] = None
1223 info_dict['playlist_index'] = None
1224
1225 thumbnails = info_dict.get('thumbnails')
1226 if thumbnails is None:
1227 thumbnail = info_dict.get('thumbnail')
1228 if thumbnail:
1229 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1230 if thumbnails:
1231 thumbnails.sort(key=lambda t: (
1232 t.get('preference'), t.get('width'), t.get('height'),
1233 t.get('id'), t.get('url')))
1234 for i, t in enumerate(thumbnails):
1235 t['url'] = sanitize_url(t['url'])
1236 if t.get('width') and t.get('height'):
1237 t['resolution'] = '%dx%d' % (t['width'], t['height'])
1238 if t.get('id') is None:
1239 t['id'] = '%d' % i
1240
1241 if self.params.get('list_thumbnails'):
1242 self.list_thumbnails(info_dict)
1243 return
1244
1245 thumbnail = info_dict.get('thumbnail')
1246 if thumbnail:
1247 info_dict['thumbnail'] = sanitize_url(thumbnail)
1248 elif thumbnails:
1249 info_dict['thumbnail'] = thumbnails[-1]['url']
1250
1251 if 'display_id' not in info_dict and 'id' in info_dict:
1252 info_dict['display_id'] = info_dict['id']
1253
1254 if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1255 # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1256 # see http://bugs.python.org/issue1646728)
1257 try:
1258 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1259 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1260 except (ValueError, OverflowError, OSError):
1261 pass
1262
1263 # Auto generate title fields corresponding to the *_number fields when missing
1264 # in order to always have clean titles. This is very common for TV series.
1265 for field in ('chapter', 'season', 'episode'):
1266 if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1267 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1268
1269 subtitles = info_dict.get('subtitles')
1270 if subtitles:
1271 for _, subtitle in subtitles.items():
1272 for subtitle_format in subtitle:
1273 if subtitle_format.get('url'):
1274 subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1275 if 'ext' not in subtitle_format:
1276 subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1277
1278 if self.params.get('listsubtitles', False):
1279 if 'automatic_captions' in info_dict:
1280 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1281 self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1282 return
1283 info_dict['requested_subtitles'] = self.process_subtitles(
1284 info_dict['id'], subtitles,
1285 info_dict.get('automatic_captions'))
1286
1287 # We now pick which formats have to be downloaded
1288 if info_dict.get('formats') is None:
1289 # There's only one format available
1290 formats = [info_dict]
1291 else:
1292 formats = info_dict['formats']
1293
1294 if not formats:
1295 raise ExtractorError('No video formats found!')
1296
1297 formats_dict = {}
1298
1299 # We check that all the formats have the format and format_id fields
1300 for i, format in enumerate(formats):
1301 if 'url' not in format:
1302 raise ExtractorError('Missing "url" key in result (index %d)' % i)
1303
1304 format['url'] = sanitize_url(format['url'])
1305
1306 if format.get('format_id') is None:
1307 format['format_id'] = compat_str(i)
1308 else:
1309 # Sanitize format_id from characters used in format selector expression
1310 format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
1311 format_id = format['format_id']
1312 if format_id not in formats_dict:
1313 formats_dict[format_id] = []
1314 formats_dict[format_id].append(format)
1315
1316 # Make sure all formats have unique format_id
1317 for format_id, ambiguous_formats in formats_dict.items():
1318 if len(ambiguous_formats) > 1:
1319 for i, format in enumerate(ambiguous_formats):
1320 format['format_id'] = '%s-%d' % (format_id, i)
1321
1322 for i, format in enumerate(formats):
1323 if format.get('format') is None:
1324 format['format'] = '{id} - {res}{note}'.format(
1325 id=format['format_id'],
1326 res=self.format_resolution(format),
1327 note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1328 )
1329 # Automatically determine file extension if missing
1330 if 'ext' not in format:
1331 format['ext'] = determine_ext(format['url']).lower()
1332 # Automatically determine protocol if missing (useful for format
1333 # selection purposes)
1334 if 'protocol' not in format:
1335 format['protocol'] = determine_protocol(format)
1336 # Add HTTP headers, so that external programs can use them from the
1337 # json output
1338 full_format_info = info_dict.copy()
1339 full_format_info.update(format)
1340 format['http_headers'] = self._calc_headers(full_format_info)
1341
1342 # TODO Central sorting goes here
1343
1344 if formats[0] is not info_dict:
1345 # only set the 'formats' fields if the original info_dict list them
1346 # otherwise we end up with a circular reference, the first (and unique)
1347 # element in the 'formats' field in info_dict is info_dict itself,
1348 # which can't be exported to json
1349 info_dict['formats'] = formats
1350 if self.params.get('listformats'):
1351 self.list_formats(info_dict)
1352 return
1353
1354 req_format = self.params.get('format')
1355 if req_format is None:
1356 req_format_list = []
1357 if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
1358 not info_dict.get('is_live')):
1359 merger = FFmpegMergerPP(self)
1360 if merger.available and merger.can_merge():
1361 req_format_list.append('bestvideo+bestaudio')
1362 req_format_list.append('best')
1363 req_format = '/'.join(req_format_list)
1364 format_selector = self.build_format_selector(req_format)
1365 formats_to_download = list(format_selector(formats))
1366 if not formats_to_download:
1367 raise ExtractorError('requested format not available',
1368 expected=True)
1369
1370 if download:
1371 if len(formats_to_download) > 1:
1372 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1373 for format in formats_to_download:
1374 new_info = dict(info_dict)
1375 new_info.update(format)
1376 self.process_info(new_info)
1377 # We update the info dict with the best quality format (backwards compatibility)
1378 info_dict.update(formats_to_download[-1])
1379 return info_dict
1380
1381 def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1382 """Select the requested subtitles and their format"""
1383 available_subs = {}
1384 if normal_subtitles and self.params.get('writesubtitles'):
1385 available_subs.update(normal_subtitles)
1386 if automatic_captions and self.params.get('writeautomaticsub'):
1387 for lang, cap_info in automatic_captions.items():
1388 if lang not in available_subs:
1389 available_subs[lang] = cap_info
1390
1391 if (not self.params.get('writesubtitles') and not
1392 self.params.get('writeautomaticsub') or not
1393 available_subs):
1394 return None
1395
1396 if self.params.get('allsubtitles', False):
1397 requested_langs = available_subs.keys()
1398 else:
1399 if self.params.get('subtitleslangs', False):
1400 requested_langs = self.params.get('subtitleslangs')
1401 elif 'en' in available_subs:
1402 requested_langs = ['en']
1403 else:
1404 requested_langs = [list(available_subs.keys())[0]]
1405
1406 formats_query = self.params.get('subtitlesformat', 'best')
1407 formats_preference = formats_query.split('/') if formats_query else []
1408 subs = {}
1409 for lang in requested_langs:
1410 formats = available_subs.get(lang)
1411 if formats is None:
1412 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1413 continue
1414 for ext in formats_preference:
1415 if ext == 'best':
1416 f = formats[-1]
1417 break
1418 matches = list(filter(lambda f: f['ext'] == ext, formats))
1419 if matches:
1420 f = matches[-1]
1421 break
1422 else:
1423 f = formats[-1]
1424 self.report_warning(
1425 'No subtitle format found matching "%s" for language %s, '
1426 'using %s' % (formats_query, lang, f['ext']))
1427 subs[lang] = f
1428 return subs
1429
1430 def process_info(self, info_dict):
1431 """Process a single resolved IE result."""
1432
1433 assert info_dict.get('_type', 'video') == 'video'
1434
1435 max_downloads = self.params.get('max_downloads')
1436 if max_downloads is not None:
1437 if self._num_downloads >= int(max_downloads):
1438 raise MaxDownloadsReached()
1439
1440 info_dict['fulltitle'] = info_dict['title']
1441 if len(info_dict['title']) > 200:
1442 info_dict['title'] = info_dict['title'][:197] + '...'
1443
1444 if 'format' not in info_dict:
1445 info_dict['format'] = info_dict['ext']
1446
1447 reason = self._match_entry(info_dict, incomplete=False)
1448 if reason is not None:
1449 self.to_screen('[download] ' + reason)
1450 return
1451
1452 self._num_downloads += 1
1453
1454 info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1455
1456 # Forced printings
1457 if self.params.get('forcetitle', False):
1458 self.to_stdout(info_dict['fulltitle'])
1459 if self.params.get('forceid', False):
1460 self.to_stdout(info_dict['id'])
1461 if self.params.get('forceurl', False):
1462 if info_dict.get('requested_formats') is not None:
1463 for f in info_dict['requested_formats']:
1464 self.to_stdout(f['url'] + f.get('play_path', ''))
1465 else:
1466 # For RTMP URLs, also include the playpath
1467 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1468 if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1469 self.to_stdout(info_dict['thumbnail'])
1470 if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1471 self.to_stdout(info_dict['description'])
1472 if self.params.get('forcefilename', False) and filename is not None:
1473 self.to_stdout(filename)
1474 if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1475 self.to_stdout(formatSeconds(info_dict['duration']))
1476 if self.params.get('forceformat', False):
1477 self.to_stdout(info_dict['format'])
1478 if self.params.get('forcejson', False):
1479 self.to_stdout(json.dumps(info_dict))
1480
1481 # Do nothing else if in simulate mode
1482 if self.params.get('simulate', False):
1483 return
1484
1485 if filename is None:
1486 return
1487
1488 try:
1489 dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
1490 if dn and not os.path.exists(dn):
1491 os.makedirs(dn)
1492 except (OSError, IOError) as err:
1493 self.report_error('unable to create directory ' + error_to_compat_str(err))
1494 return
1495
1496 if self.params.get('writedescription', False):
1497 descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1498 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1499 self.to_screen('[info] Video description is already present')
1500 elif info_dict.get('description') is None:
1501 self.report_warning('There\'s no description to write.')
1502 else:
1503 try:
1504 self.to_screen('[info] Writing video description to: ' + descfn)
1505 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1506 descfile.write(info_dict['description'])
1507 except (OSError, IOError):
1508 self.report_error('Cannot write description file ' + descfn)
1509 return
1510
1511 if self.params.get('writeannotations', False):
1512 annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1513 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1514 self.to_screen('[info] Video annotations are already present')
1515 else:
1516 try:
1517 self.to_screen('[info] Writing video annotations to: ' + annofn)
1518 with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1519 annofile.write(info_dict['annotations'])
1520 except (KeyError, TypeError):
1521 self.report_warning('There are no annotations to write.')
1522 except (OSError, IOError):
1523 self.report_error('Cannot write annotations file: ' + annofn)
1524 return
1525
1526 subtitles_are_requested = any([self.params.get('writesubtitles', False),
1527 self.params.get('writeautomaticsub')])
1528
1529 if subtitles_are_requested and info_dict.get('requested_subtitles'):
1530 # subtitles download errors are already managed as troubles in relevant IE
1531 # that way it will silently go on when used with unsupporting IE
1532 subtitles = info_dict['requested_subtitles']
1533 ie = self.get_info_extractor(info_dict['extractor_key'])
1534 for sub_lang, sub_info in subtitles.items():
1535 sub_format = sub_info['ext']
1536 if sub_info.get('data') is not None:
1537 sub_data = sub_info['data']
1538 else:
1539 try:
1540 sub_data = ie._download_webpage(
1541 sub_info['url'], info_dict['id'], note=False)
1542 except ExtractorError as err:
1543 self.report_warning('Unable to download subtitle for "%s": %s' %
1544 (sub_lang, error_to_compat_str(err.cause)))
1545 continue
1546 try:
1547 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1548 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1549 self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
1550 else:
1551 self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1552 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
1553 subfile.write(sub_data)
1554 except (OSError, IOError):
1555 self.report_error('Cannot write subtitles file ' + sub_filename)
1556 return
1557
1558 if self.params.get('writeinfojson', False):
1559 infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1560 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1561 self.to_screen('[info] Video description metadata is already present')
1562 else:
1563 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1564 try:
1565 write_json_file(self.filter_requested_info(info_dict), infofn)
1566 except (OSError, IOError):
1567 self.report_error('Cannot write metadata to JSON file ' + infofn)
1568 return
1569
1570 self._write_thumbnails(info_dict, filename)
1571
1572 if not self.params.get('skip_download', False):
1573 try:
1574 def dl(name, info):
1575 fd = get_suitable_downloader(info, self.params)(self, self.params)
1576 for ph in self._progress_hooks:
1577 fd.add_progress_hook(ph)
1578 if self.params.get('verbose'):
1579 self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1580 return fd.download(name, info)
1581
1582 if info_dict.get('requested_formats') is not None:
1583 downloaded = []
1584 success = True
1585 merger = FFmpegMergerPP(self)
1586 if not merger.available:
1587 postprocessors = []
1588 self.report_warning('You have requested multiple '
1589 'formats but ffmpeg or avconv are not installed.'
1590 ' The formats won\'t be merged.')
1591 else:
1592 postprocessors = [merger]
1593
1594 def compatible_formats(formats):
1595 video, audio = formats
1596 # Check extension
1597 video_ext, audio_ext = audio.get('ext'), video.get('ext')
1598 if video_ext and audio_ext:
1599 COMPATIBLE_EXTS = (
1600 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
1601 ('webm')
1602 )
1603 for exts in COMPATIBLE_EXTS:
1604 if video_ext in exts and audio_ext in exts:
1605 return True
1606 # TODO: Check acodec/vcodec
1607 return False
1608
1609 filename_real_ext = os.path.splitext(filename)[1][1:]
1610 filename_wo_ext = (
1611 os.path.splitext(filename)[0]
1612 if filename_real_ext == info_dict['ext']
1613 else filename)
1614 requested_formats = info_dict['requested_formats']
1615 if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1616 info_dict['ext'] = 'mkv'
1617 self.report_warning(
1618 'Requested formats are incompatible for merge and will be merged into mkv.')
1619 # Ensure filename always has a correct extension for successful merge
1620 filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1621 if os.path.exists(encodeFilename(filename)):
1622 self.to_screen(
1623 '[download] %s has already been downloaded and '
1624 'merged' % filename)
1625 else:
1626 for f in requested_formats:
1627 new_info = dict(info_dict)
1628 new_info.update(f)
1629 fname = self.prepare_filename(new_info)
1630 fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
1631 downloaded.append(fname)
1632 partial_success = dl(fname, new_info)
1633 success = success and partial_success
1634 info_dict['__postprocessors'] = postprocessors
1635 info_dict['__files_to_merge'] = downloaded
1636 else:
1637 # Just a single file
1638 success = dl(filename, info_dict)
1639 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1640 self.report_error('unable to download video data: %s' % str(err))
1641 return
1642 except (OSError, IOError) as err:
1643 raise UnavailableVideoError(err)
1644 except (ContentTooShortError, ) as err:
1645 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1646 return
1647
1648 if success and filename != '-':
1649 # Fixup content
1650 fixup_policy = self.params.get('fixup')
1651 if fixup_policy is None:
1652 fixup_policy = 'detect_or_warn'
1653
1654 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1655
1656 stretched_ratio = info_dict.get('stretched_ratio')
1657 if stretched_ratio is not None and stretched_ratio != 1:
1658 if fixup_policy == 'warn':
1659 self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1660 info_dict['id'], stretched_ratio))
1661 elif fixup_policy == 'detect_or_warn':
1662 stretched_pp = FFmpegFixupStretchedPP(self)
1663 if stretched_pp.available:
1664 info_dict.setdefault('__postprocessors', [])
1665 info_dict['__postprocessors'].append(stretched_pp)
1666 else:
1667 self.report_warning(
1668 '%s: Non-uniform pixel ratio (%s). %s'
1669 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1670 else:
1671 assert fixup_policy in ('ignore', 'never')
1672
1673 if (info_dict.get('requested_formats') is None and
1674 info_dict.get('container') == 'm4a_dash'):
1675 if fixup_policy == 'warn':
1676 self.report_warning(
1677 '%s: writing DASH m4a. '
1678 'Only some players support this container.'
1679 % info_dict['id'])
1680 elif fixup_policy == 'detect_or_warn':
1681 fixup_pp = FFmpegFixupM4aPP(self)
1682 if fixup_pp.available:
1683 info_dict.setdefault('__postprocessors', [])
1684 info_dict['__postprocessors'].append(fixup_pp)
1685 else:
1686 self.report_warning(
1687 '%s: writing DASH m4a. '
1688 'Only some players support this container. %s'
1689 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1690 else:
1691 assert fixup_policy in ('ignore', 'never')
1692
1693 if (info_dict.get('protocol') == 'm3u8_native' or
1694 info_dict.get('protocol') == 'm3u8' and
1695 self.params.get('hls_prefer_native')):
1696 if fixup_policy == 'warn':
1697 self.report_warning('%s: malformated aac bitstream.' % (
1698 info_dict['id']))
1699 elif fixup_policy == 'detect_or_warn':
1700 fixup_pp = FFmpegFixupM3u8PP(self)
1701 if fixup_pp.available:
1702 info_dict.setdefault('__postprocessors', [])
1703 info_dict['__postprocessors'].append(fixup_pp)
1704 else:
1705 self.report_warning(
1706 '%s: malformated aac bitstream. %s'
1707 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1708 else:
1709 assert fixup_policy in ('ignore', 'never')
1710
1711 try:
1712 self.post_process(filename, info_dict)
1713 except (PostProcessingError) as err:
1714 self.report_error('postprocessing: %s' % str(err))
1715 return
1716 self.record_download_archive(info_dict)
1717
1718 def download(self, url_list):
1719 """Download a given list of URLs."""
1720 outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1721 if (len(url_list) > 1 and
1722 '%' not in outtmpl and
1723 self.params.get('max_downloads') != 1):
1724 raise SameFileError(outtmpl)
1725
1726 for url in url_list:
1727 try:
1728 # It also downloads the videos
1729 res = self.extract_info(
1730 url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1731 except UnavailableVideoError:
1732 self.report_error('unable to download video')
1733 except MaxDownloadsReached:
1734 self.to_screen('[info] Maximum number of downloaded files reached.')
1735 raise
1736 else:
1737 if self.params.get('dump_single_json', False):
1738 self.to_stdout(json.dumps(res))
1739
1740 return self._download_retcode
1741
1742 def download_with_info_file(self, info_filename):
1743 with contextlib.closing(fileinput.FileInput(
1744 [info_filename], mode='r',
1745 openhook=fileinput.hook_encoded('utf-8'))) as f:
1746 # FileInput doesn't have a read method, we can't call json.load
1747 info = self.filter_requested_info(json.loads('\n'.join(f)))
1748 try:
1749 self.process_ie_result(info, download=True)
1750 except DownloadError:
1751 webpage_url = info.get('webpage_url')
1752 if webpage_url is not None:
1753 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
1754 return self.download([webpage_url])
1755 else:
1756 raise
1757 return self._download_retcode
1758
1759 @staticmethod
1760 def filter_requested_info(info_dict):
1761 return dict(
1762 (k, v) for k, v in info_dict.items()
1763 if k not in ['requested_formats', 'requested_subtitles'])
1764
1765 def post_process(self, filename, ie_info):
1766 """Run all the postprocessors on the given file."""
1767 info = dict(ie_info)
1768 info['filepath'] = filename
1769 pps_chain = []
1770 if ie_info.get('__postprocessors') is not None:
1771 pps_chain.extend(ie_info['__postprocessors'])
1772 pps_chain.extend(self._pps)
1773 for pp in pps_chain:
1774 files_to_delete = []
1775 try:
1776 files_to_delete, info = pp.run(info)
1777 except PostProcessingError as e:
1778 self.report_error(e.msg)
1779 if files_to_delete and not self.params.get('keepvideo', False):
1780 for old_filename in files_to_delete:
1781 self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
1782 try:
1783 os.remove(encodeFilename(old_filename))
1784 except (IOError, OSError):
1785 self.report_warning('Unable to remove downloaded original file')
1786
1787 def _make_archive_id(self, info_dict):
1788 # Future-proof against any change in case
1789 # and backwards compatibility with prior versions
1790 extractor = info_dict.get('extractor_key')
1791 if extractor is None:
1792 if 'id' in info_dict:
1793 extractor = info_dict.get('ie_key') # key in a playlist
1794 if extractor is None:
1795 return None # Incomplete video information
1796 return extractor.lower() + ' ' + info_dict['id']
1797
1798 def in_download_archive(self, info_dict):
1799 fn = self.params.get('download_archive')
1800 if fn is None:
1801 return False
1802
1803 vid_id = self._make_archive_id(info_dict)
1804 if vid_id is None:
1805 return False # Incomplete video information
1806
1807 try:
1808 with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1809 for line in archive_file:
1810 if line.strip() == vid_id:
1811 return True
1812 except IOError as ioe:
1813 if ioe.errno != errno.ENOENT:
1814 raise
1815 return False
1816
1817 def record_download_archive(self, info_dict):
1818 fn = self.params.get('download_archive')
1819 if fn is None:
1820 return
1821 vid_id = self._make_archive_id(info_dict)
1822 assert vid_id
1823 with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1824 archive_file.write(vid_id + '\n')
1825
1826 @staticmethod
1827 def format_resolution(format, default='unknown'):
1828 if format.get('vcodec') == 'none':
1829 return 'audio only'
1830 if format.get('resolution') is not None:
1831 return format['resolution']
1832 if format.get('height') is not None:
1833 if format.get('width') is not None:
1834 res = '%sx%s' % (format['width'], format['height'])
1835 else:
1836 res = '%sp' % format['height']
1837 elif format.get('width') is not None:
1838 res = '%dx?' % format['width']
1839 else:
1840 res = default
1841 return res
1842
    def _format_note(self, fdict):
        """Build a short human-readable note for a single format dict.

        The note is accreted piece by piece (codec, bitrates, fps, audio
        info, filesize, ...); most pieces prepend ', ' only when something
        has already been written, so the order of the checks is significant.
        """
        res = ''
        # f4f/f4m fragments cannot be downloaded directly; flag them
        if fdict.get('ext') in ['f4f', 'f4m']:
            res += '(unsupported) '
        if fdict.get('language'):
            if res:
                res += ' '
            res += '[%s] ' % fdict['language']
        if fdict.get('format_note') is not None:
            res += fdict['format_note'] + ' '
        if fdict.get('tbr') is not None:
            # total bitrate, right-aligned to 4 digits
            res += '%4dk ' % fdict['tbr']
        if fdict.get('container') is not None:
            if res:
                res += ', '
            res += '%s container' % fdict['container']
        if (fdict.get('vcodec') is not None and
                fdict.get('vcodec') != 'none'):
            if res:
                res += ', '
            res += fdict['vcodec']
            # '@' glues the codec name to the video bitrate appended below
            if fdict.get('vbr') is not None:
                res += '@'
        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
            # no usable video codec name, but separate video/audio bitrates exist
            res += 'video@'
        if fdict.get('vbr') is not None:
            res += '%4dk' % fdict['vbr']
        if fdict.get('fps') is not None:
            if res:
                res += ', '
            res += '%sfps' % fdict['fps']
        if fdict.get('acodec') is not None:
            if res:
                res += ', '
            if fdict['acodec'] == 'none':
                res += 'video only'
            else:
                res += '%-5s' % fdict['acodec']
        elif fdict.get('abr') is not None:
            if res:
                res += ', '
            res += 'audio'
        if fdict.get('abr') is not None:
            # audio bitrate, attached to the codec name / 'audio' marker above
            res += '@%3dk' % fdict['abr']
        if fdict.get('asr') is not None:
            # audio sampling rate
            res += ' (%5dHz)' % fdict['asr']
        if fdict.get('filesize') is not None:
            if res:
                res += ', '
            res += format_bytes(fdict['filesize'])
        elif fdict.get('filesize_approx') is not None:
            if res:
                res += ', '
            res += '~' + format_bytes(fdict['filesize_approx'])
        return res
1898
1899 def list_formats(self, info_dict):
1900 formats = info_dict.get('formats', [info_dict])
1901 table = [
1902 [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
1903 for f in formats
1904 if f.get('preference') is None or f['preference'] >= -1000]
1905 if len(formats) > 1:
1906 table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
1907
1908 header_line = ['format code', 'extension', 'resolution', 'note']
1909 self.to_screen(
1910 '[info] Available formats for %s:\n%s' %
1911 (info_dict['id'], render_table(header_line, table)))
1912
1913 def list_thumbnails(self, info_dict):
1914 thumbnails = info_dict.get('thumbnails')
1915 if not thumbnails:
1916 self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
1917 return
1918
1919 self.to_screen(
1920 '[info] Thumbnails for %s:' % info_dict['id'])
1921 self.to_screen(render_table(
1922 ['ID', 'width', 'height', 'URL'],
1923 [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
1924
1925 def list_subtitles(self, video_id, subtitles, name='subtitles'):
1926 if not subtitles:
1927 self.to_screen('%s has no %s' % (video_id, name))
1928 return
1929 self.to_screen(
1930 'Available %s for %s:' % (name, video_id))
1931 self.to_screen(render_table(
1932 ['Language', 'formats'],
1933 [[lang, ', '.join(f['ext'] for f in reversed(formats))]
1934 for lang, formats in subtitles.items()]))
1935
    def urlopen(self, req):
        """ Start an HTTP download """
        # Plain URL strings are wrapped into a sanitized Request object first
        if isinstance(req, compat_basestring):
            req = sanitized_Request(req)
        return self._opener.open(req, timeout=self._socket_timeout)
1941
1942 def print_debug_header(self):
1943 if not self.params.get('verbose'):
1944 return
1945
1946 if type('') is not compat_str:
1947 # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
1948 self.report_warning(
1949 'Your Python is broken! Update to a newer and supported version')
1950
1951 stdout_encoding = getattr(
1952 sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
1953 encoding_str = (
1954 '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
1955 locale.getpreferredencoding(),
1956 sys.getfilesystemencoding(),
1957 stdout_encoding,
1958 self.get_encoding()))
1959 write_string(encoding_str, encoding=None)
1960
1961 self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
1962 if _LAZY_LOADER:
1963 self._write_string('[debug] Lazy loading extractors enabled' + '\n')
1964 try:
1965 sp = subprocess.Popen(
1966 ['git', 'rev-parse', '--short', 'HEAD'],
1967 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1968 cwd=os.path.dirname(os.path.abspath(__file__)))
1969 out, err = sp.communicate()
1970 out = out.decode().strip()
1971 if re.match('[0-9a-f]+', out):
1972 self._write_string('[debug] Git HEAD: ' + out + '\n')
1973 except Exception:
1974 try:
1975 sys.exc_clear()
1976 except Exception:
1977 pass
1978 self._write_string('[debug] Python version %s - %s\n' % (
1979 platform.python_version(), platform_name()))
1980
1981 exe_versions = FFmpegPostProcessor.get_versions(self)
1982 exe_versions['rtmpdump'] = rtmpdump_version()
1983 exe_str = ', '.join(
1984 '%s %s' % (exe, v)
1985 for exe, v in sorted(exe_versions.items())
1986 if v
1987 )
1988 if not exe_str:
1989 exe_str = 'none'
1990 self._write_string('[debug] exe versions: %s\n' % exe_str)
1991
1992 proxy_map = {}
1993 for handler in self._opener.handlers:
1994 if hasattr(handler, 'proxies'):
1995 proxy_map.update(handler.proxies)
1996 self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1997
1998 if self.params.get('call_home', False):
1999 ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
2000 self._write_string('[debug] Public IP address: %s\n' % ipaddr)
2001 latest_version = self.urlopen(
2002 'https://yt-dl.org/latest/version').read().decode('utf-8')
2003 if version_tuple(latest_version) > version_tuple(__version__):
2004 self.report_warning(
2005 'You are using an outdated version (newest version: %s)! '
2006 'See https://yt-dl.org/update if you need help updating.' %
2007 latest_version)
2008
    def _setup_opener(self):
        """Build the urllib opener used for all network requests.

        Configures socket timeout, cookie jar, proxies and the custom
        HTTP(S)/data handlers, then stores the opener as self._opener
        and the jar as self.cookiejar.
        """
        timeout_val = self.params.get('socket_timeout')
        # Default socket timeout is 10 minutes
        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)

        opts_cookiefile = self.params.get('cookiefile')
        opts_proxy = self.params.get('proxy')

        if opts_cookiefile is None:
            # In-memory jar only; nothing is persisted
            self.cookiejar = compat_cookiejar.CookieJar()
        else:
            self.cookiejar = compat_cookiejar.MozillaCookieJar(
                opts_cookiefile)
            # Load existing cookies only if the file is readable
            if os.access(opts_cookiefile, os.R_OK):
                self.cookiejar.load()

        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
        if opts_proxy is not None:
            # An explicit empty string disables proxying entirely
            if opts_proxy == '':
                proxies = {}
            else:
                proxies = {'http': opts_proxy, 'https': opts_proxy}
        else:
            # Fall back to the environment's proxy settings
            proxies = compat_urllib_request.getproxies()
            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
            if 'http' in proxies and 'https' not in proxies:
                proxies['https'] = proxies['http']
        proxy_handler = PerRequestProxyHandler(proxies)

        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
        data_handler = compat_urllib_request_DataHandler()

        # When passing our own FileHandler instance, build_opener won't add the
        # default FileHandler and allows us to disable the file protocol, which
        # can be used for malicious purposes (see
        # https://github.com/rg3/youtube-dl/issues/8227)
        file_handler = compat_urllib_request.FileHandler()

        def file_open(*args, **kwargs):
            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
        file_handler.file_open = file_open

        opener = compat_urllib_request.build_opener(
            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        self._opener = opener
2060
2061 def encode(self, s):
2062 if isinstance(s, bytes):
2063 return s # Already encoded
2064
2065 try:
2066 return s.encode(self.get_encoding())
2067 except UnicodeEncodeError as err:
2068 err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2069 raise
2070
2071 def get_encoding(self):
2072 encoding = self.params.get('encoding')
2073 if encoding is None:
2074 encoding = preferredencoding()
2075 return encoding
2076
2077 def _write_thumbnails(self, info_dict, filename):
2078 if self.params.get('writethumbnail', False):
2079 thumbnails = info_dict.get('thumbnails')
2080 if thumbnails:
2081 thumbnails = [thumbnails[-1]]
2082 elif self.params.get('write_all_thumbnails', False):
2083 thumbnails = info_dict.get('thumbnails')
2084 else:
2085 return
2086
2087 if not thumbnails:
2088 # No thumbnails present, so return immediately
2089 return
2090
2091 for t in thumbnails:
2092 thumb_ext = determine_ext(t['url'], 'jpg')
2093 suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2094 thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2095 t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2096
2097 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2098 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2099 (info_dict['extractor'], info_dict['id'], thumb_display_id))
2100 else:
2101 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2102 (info_dict['extractor'], info_dict['id'], thumb_display_id))
2103 try:
2104 uf = self.urlopen(t['url'])
2105 with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2106 shutil.copyfileobj(uf, thumbf)
2107 self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2108 (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2109 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2110 self.report_warning('Unable to download thumbnail "%s": %s' %
2111 (t['url'], error_to_compat_str(err)))