simplejson/encoder.py

   1 """
   2 Implementation of JSONEncoder
   3 """
   4 import re
   5
   6 try:
   7     from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii
   8 except ImportError:
   9     pass
  10
  11 ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
  12 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
  13 HAS_UTF8 = re.compile(r'[\x80-\xff]')
  14 ESCAPE_DCT = {
  15     '\\': '\\\\',
  16     '"': '\\"',
  17     '\b': '\\b',
  18     '\f': '\\f',
  19     '\n': '\\n',
  20     '\r': '\\r',
  21     '\t': '\\t',
  22 }
  23 for i in range(0x20):
  24     ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
  25
  26 # Assume this produces an infinity on all machines (probably not guaranteed)
  27 INFINITY = float('1e66666')
  28 FLOAT_REPR = repr
  29
  30 def floatstr(o, allow_nan=True):
  31     # Check for specials.  Note that this type of test is processor- and/or
  32     # platform-specific, so do tests which don't depend on the internals.
  33
  34     if o != o:
  35         text = 'NaN'
  36     elif o == INFINITY:
  37         text = 'Infinity'
  38     elif o == -INFINITY:
  39         text = '-Infinity'
  40     else:
  41         return FLOAT_REPR(o)
  42
  43     if not allow_nan:
  44         raise ValueError("Out of range float values are not JSON compliant: %r"
  45             % (o,))
  46
  47     return text
  48
  49
  50 def encode_basestring(s):
  51     """
  52     Return a JSON representation of a Python string
  53     """
  54     def replace(match):
  55         return ESCAPE_DCT[match.group(0)]
  56     return '"' + ESCAPE.sub(replace, s) + '"'
  57
  58
  59 def py_encode_basestring_ascii(s):
  60     if isinstance(s, str) and HAS_UTF8.search(s) is not None:
  61         s = s.decode('utf-8')
  62     def replace(match):
  63         s = match.group(0)
  64         try:
  65             return ESCAPE_DCT[s]
  66         except KeyError:
  67             n = ord(s)
  68             if n < 0x10000:
  69                 return '\\u%04x' % (n,)
  70             else:
  71                 # surrogate pair
  72                 n -= 0x10000
  73                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
  74                 s2 = 0xdc00 | (n & 0x3ff)
  75                 return '\\u%04x\\u%04x' % (s1, s2)
  76     return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
  77
  78
  79 try:
  80     encode_basestring_ascii = c_encode_basestring_ascii
  81 except NameError:
  82     encode_basestring_ascii = py_encode_basestring_ascii
  83
  84
  85 class JSONEncoder(object):
  86     """
  87     Extensible JSON <http://json.org> encoder for Python data structures.
  88
  89     Supports the following objects and types by default:
  90
  91     +-------------------+---------------+
  92     | Python            | JSON          |
  93     +===================+===============+
  94     | dict              | object        |
  95     +-------------------+---------------+
  96     | list, tuple       | array         |
  97     +-------------------+---------------+
  98     | str, unicode      | string        |
  99     +-------------------+---------------+
 100     | int, long, float  | number        |
 101     +-------------------+---------------+
 102     | True              | true          |
 103     +-------------------+---------------+
 104     | False             | false         |
 105     +-------------------+---------------+
 106     | None              | null          |
 107     +-------------------+---------------+
 108
 109     To extend this to recognize other objects, subclass and implement a
 110     ``.default()`` method with another method that returns a serializable
 111     object for ``o`` if possible, otherwise it should call the superclass
 112     implementation (to raise ``TypeError``).
 113     """
 114     __all__ = ['__init__', 'default', 'encode', 'iterencode']
 115     item_separator = ', '
 116     key_separator = ': '
 117     def __init__(self, skipkeys=False, ensure_ascii=True,
 118             check_circular=True, allow_nan=True, sort_keys=False,
 119             indent=None, separators=None, encoding='utf-8', default=None):
 120         """
 121         Constructor for JSONEncoder, with sensible defaults.
 122
 123         If skipkeys is False, then it is a TypeError to attempt
 124         encoding of keys that are not str, int, long, float or None.  If
 125         skipkeys is True, such items are simply skipped.
 126
 127         If ensure_ascii is True, the output is guaranteed to be str
 128         objects with all incoming unicode characters escaped.  If
 129         ensure_ascii is false, the output will be unicode object.
 130
 131         If check_circular is True, then lists, dicts, and custom encoded
 132         objects will be checked for circular references during encoding to
 133         prevent an infinite recursion (which would cause an OverflowError).
 134         Otherwise, no such check takes place.
 135
 136         If allow_nan is True, then NaN, Infinity, and -Infinity will be
 137         encoded as such.  This behavior is not JSON specification compliant,
 138         but is consistent with most JavaScript based encoders and decoders.
 139         Otherwise, it will be a ValueError to encode such floats.
 140
 141         If sort_keys is True, then the output of dictionaries will be
 142         sorted by key; this is useful for regression tests to ensure
 143         that JSON serializations can be compared on a day-to-day basis.
 144
 145         If indent is a non-negative integer, then JSON array
 146         elements and object members will be pretty-printed with that
 147         indent level.  An indent level of 0 will only insert newlines.
 148         None is the most compact representation.
 149
 150         If specified, separators should be a (item_separator, key_separator)
 151         tuple.  The default is (', ', ': ').  To get the most compact JSON
 152         representation you should specify (',', ':') to eliminate whitespace.
 153
 154         If specified, default is a function that gets called for objects
 155         that can't otherwise be serialized.  It should return a JSON encodable
 156         version of the object or raise a ``TypeError``.
 157
 158         If encoding is not None, then all input strings will be
 159         transformed into unicode using that encoding prior to JSON-encoding.
 160         The default is UTF-8.
 161         """
 162
 163         self.skipkeys = skipkeys
 164         self.ensure_ascii = ensure_ascii
 165         self.check_circular = check_circular
 166         self.allow_nan = allow_nan
 167         self.sort_keys = sort_keys
 168         self.indent = indent
 169         self.current_indent_level = 0
 170         if separators is not None:
 171             self.item_separator, self.key_separator = separators
 172         if default is not None:
 173             self.default = default
 174         self.encoding = encoding
 175
 176     def _newline_indent(self):
 177         return '\n' + (' ' * (self.indent * self.current_indent_level))
 178
 179     def _iterencode_list(self, lst, markers=None):
 180         if not lst:
 181             yield '[]'
 182             return
 183         if markers is not None:
 184             markerid = id(lst)
 185             if markerid in markers:
 186                 raise ValueError("Circular reference detected")
 187             markers[markerid] = lst
 188         yield '['
 189         if self.indent is not None:
 190             self.current_indent_level += 1
 191             newline_indent = self._newline_indent()
 192             separator = self.item_separator + newline_indent
 193             yield newline_indent
 194         else:
 195             newline_indent = None
 196             separator = self.item_separator
 197         first = True
 198         for value in lst:
 199             if first:
 200                 first = False
 201             else:
 202                 yield separator
 203             for chunk in self._iterencode(value, markers):
 204                 yield chunk
 205         if newline_indent is not None:
 206             self.current_indent_level -= 1
 207             yield self._newline_indent()
 208         yield ']'
 209         if markers is not None:
 210             del markers[markerid]
 211
 212     def _iterencode_dict(self, dct, markers=None):
 213         if not dct:
 214             yield '{}'
 215             return
 216         if markers is not None:
 217             markerid = id(dct)
 218             if markerid in markers:
 219                 raise ValueError("Circular reference detected")
 220             markers[markerid] = dct
 221         yield '{'
 222         key_separator = self.key_separator
 223         if self.indent is not None:
 224             self.current_indent_level += 1
 225             newline_indent = self._newline_indent()
 226             item_separator = self.item_separator + newline_indent
 227             yield newline_indent
 228         else:
 229             newline_indent = None
 230             item_separator = self.item_separator
 231         first = True
 232         if self.ensure_ascii:
 233             encoder = encode_basestring_ascii
 234         else:
 235             encoder = encode_basestring
 236         allow_nan = self.allow_nan
 237         if self.sort_keys:
 238             keys = dct.keys()
 239             keys.sort()
 240             items = [(k, dct[k]) for k in keys]
 241         else:
 242             items = dct.iteritems()
 243         _encoding = self.encoding
 244         _do_decode = (_encoding is not None
 245             and not (_encoding == 'utf-8'))
 246         for key, value in items:
 247             if isinstance(key, str):
 248                 if _do_decode:
 249                     key = key.decode(_encoding)
 250             elif isinstance(key, basestring):
 251                 pass
 252             # JavaScript is weakly typed for these, so it makes sense to
 253             # also allow them.  Many encoders seem to do something like this.
 254             elif isinstance(key, float):
 255                 key = floatstr(key, allow_nan)
 256             elif isinstance(key, (int, long)):
 257                 key = str(key)
 258             elif key is True:
 259                 key = 'true'
 260             elif key is False:
 261                 key = 'false'
 262             elif key is None:
 263                 key = 'null'
 264             elif self.skipkeys:
 265                 continue
 266             else:
 267                 raise TypeError("key %r is not a string" % (key,))
 268             if first:
 269                 first = False
 270             else:
 271                 yield item_separator
 272             yield encoder(key)
 273             yield key_separator
 274             for chunk in self._iterencode(value, markers):
 275                 yield chunk
 276         if newline_indent is not None:
 277             self.current_indent_level -= 1
 278             yield self._newline_indent()
 279         yield '}'
 280         if markers is not None:
 281             del markers[markerid]
 282
 283     def _iterencode(self, o, markers=None):
 284         if isinstance(o, basestring):
 285             if self.ensure_ascii:
 286                 encoder = encode_basestring_ascii
 287             else:
 288                 encoder = encode_basestring
 289             _encoding = self.encoding
 290             if (_encoding is not None and isinstance(o, str)
 291                     and not (_encoding == 'utf-8')):
 292                 o = o.decode(_encoding)
 293             yield encoder(o)
 294         elif o is None:
 295             yield 'null'
 296         elif o is True:
 297             yield 'true'
 298         elif o is False:
 299             yield 'false'
 300         elif isinstance(o, (int, long)):
 301             yield str(o)
 302         elif isinstance(o, float):
 303             yield floatstr(o, self.allow_nan)
 304         elif isinstance(o, (list, tuple)):
 305             for chunk in self._iterencode_list(o, markers):
 306                 yield chunk
 307         elif isinstance(o, dict):
 308             for chunk in self._iterencode_dict(o, markers):
 309                 yield chunk
 310         else:
 311             if markers is not None:
 312                 markerid = id(o)
 313                 if markerid in markers:
 314                     raise ValueError("Circular reference detected")
 315                 markers[markerid] = o
 316             for chunk in self._iterencode_default(o, markers):
 317                 yield chunk
 318             if markers is not None:
 319                 del markers[markerid]
 320
 321     def _iterencode_default(self, o, markers=None):
 322         newobj = self.default(o)
 323         return self._iterencode(newobj, markers)
 324
 325     def default(self, o):
 326         """
 327         Implement this method in a subclass such that it returns
 328         a serializable object for ``o``, or calls the base implementation
 329         (to raise a ``TypeError``).
 330
 331         For example, to support arbitrary iterators, you could
 332         implement default like this::
 333
 334             def default(self, o):
 335                 try:
 336                     iterable = iter(o)
 337                 except TypeError:
 338                     pass
 339                 else:
 340                     return list(iterable)
 341                 return JSONEncoder.default(self, o)
 342         """
 343         raise TypeError("%r is not JSON serializable" % (o,))
 344
 345     def encode(self, o):
 346         """
 347         Return a JSON string representation of a Python data structure.
 348
 349         >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
 350         '{"foo": ["bar", "baz"]}'
 351         """
 352         # This is for extremely simple cases and benchmarks.
 353         if isinstance(o, basestring):
 354             if isinstance(o, str):
 355                 _encoding = self.encoding
 356                 if (_encoding is not None
 357                         and not (_encoding == 'utf-8')):
 358                     o = o.decode(_encoding)
 359             if self.ensure_ascii:
 360                 return encode_basestring_ascii(o)
 361             else:
 362                 return encode_basestring(o)
 363         # This doesn't pass the iterator directly to ''.join() because the
 364         # exceptions aren't as detailed.  The list call should be roughly
 365         # equivalent to the PySequence_Fast that ''.join() would do.
 366         chunks = list(self.iterencode(o))
 367         return ''.join(chunks)
 368
 369     def iterencode(self, o):
 370         """
 371         Encode the given object and yield each string
 372         representation as available.
 373
 374         For example::
 375
 376             for chunk in JSONEncoder().iterencode(bigobject):
 377                 mysocket.write(chunk)
 378         """
 379         if self.check_circular:
 380             markers = {}
 381         else:
 382             markers = None
 383         return self._iterencode(o, markers)
 384
# Names exported by ``from ... import *``: only the encoder class.
__all__ = ['JSONEncoder']