diff --git a/README.md b/README.md index 91ab0c8..f7ce80d 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,32 @@ $ git push --tags origin ## Version History / Release Notes +* v0.10.0 (2024-11-25) + * [GitHub issue #57](https://github.com/dpranke/pyjson5/issues/57). + Added a `JSON5Encoder` class that can be overridden to do custom + encoding of values. This class is vaguely similar to the `JSONEncoder` + class in the standard `json` library, except that it has an + `encode()` method that can be overridden to customize *any* + value, not just ones the standard encoder doesn't know how to handle. + It does also support a `default()` method that can be used to + encode things not normally encodable, like the JSONEncoder class. + It does not support an `iterencode` method. One could probably + be added in the future, although exactly how that would work and + interact with `encode` is a little unclear. + * Restructured the code to use the new encoder class; doing so actually + allowed me to delete a bunch of tediously duplicative code. + * Added a new `quote_style` argument to `dump()`/`dumps()` to control + how strings are encoded by default. For compatibility with older + versions of the json5 library and the standard json library, it + uses `QuoteStyle.ALWAYS_DOUBLE` which encodes all strings with double + quotes all the time. You can also configure it to use single quotes + all the time (`ALWAYS_SINGLE`), and to switch between single and double + when doing so eliminates a need to escape quotes (`PREFER_SINGLE` and + `PREFER_DOUBLE`). This also adds a `--quote-style` argument to + `python -m json5`. + * This release has a fair number of changes, but is intended to be + completely backwards-compatible. Code without changes should run exactly + as it did before. * v0.9.28 (2024-11-11) * Fix GitHub CI to install `uv` so `./run tests` works properly. * Mark Python3.13 as supported in package metadata. 
diff --git a/json5/__init__.py b/json5/__init__.py index 87d3065..b3bb927 100644 --- a/json5/__init__.py +++ b/json5/__init__.py @@ -14,13 +14,15 @@ """A pure Python implementation of the JSON5 configuration language.""" -from .lib import load, loads, dump, dumps -from .version import __version__, VERSION +from json5.lib import JSON5Encoder, QuoteStyle, load, loads, dump, dumps +from json5.version import __version__, VERSION __all__ = [ - '__version__', + 'JSON5Encoder', + 'QuoteStyle', 'VERSION', + '__version__', 'dump', 'dumps', 'load', diff --git a/json5/__main__.py b/json5/__main__.py index e394945..7b35d5a 100644 --- a/json5/__main__.py +++ b/json5/__main__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys # pragma: no cover +# pragma: no cover -from .tool import main # pragma: no cover +import sys +from json5.tool import main -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': sys.exit(main()) diff --git a/json5/arg_parser.py b/json5/arg_parser.py deleted file mode 100644 index 8cc043e..0000000 --- a/json5/arg_parser.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2015 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - - -class _Bailout(Exception): - pass - - -class ArgumentParser(argparse.ArgumentParser): - SUPPRESS = argparse.SUPPRESS - - def __init__(self, host, prog, desc, **kwargs): - kwargs['prog'] = prog - kwargs['description'] = desc - kwargs['formatter_class'] = argparse.RawDescriptionHelpFormatter - super().__init__(**kwargs) - self._host = host - self.exit_status = None - self.add_argument( - '-V', - '--version', - action='store_true', - help='print the version and exit', - ) - - def parse_args(self, args=None, namespace=None): - try: - rargs = super().parse_args(args=args, namespace=namespace) - except _Bailout: - return None - - return rargs - - def _print_message(self, message, file=None): - self._host.print_(msg=message, stream=file, end='\n') - - def print_help(self, file=None): - self._print_message(message=self.format_help(), file=file) - - def error(self, message, bailout=True): - self.exit(2, f'{self.prog}: error: {message}\n', bailout=bailout) - - def exit(self, status=0, message=None, bailout=True): - self.exit_status = status - if message: - self._print_message(message, file=self._host.stderr) - if bailout: - raise _Bailout() diff --git a/json5/host.py b/json5/host.py index 8c06463..c228f22 100644 --- a/json5/host.py +++ b/json5/host.py @@ -36,10 +36,10 @@ def join(self, *comps): def mkdtemp(self, **kwargs): return tempfile.mkdtemp(**kwargs) - def print_(self, msg='', end='\n', stream=None): - stream = stream or self.stdout - stream.write(str(msg) + end) - stream.flush() + def print(self, msg='', end='\n', file=None): + file = file or self.stdout + file.write(str(msg) + end) + file.flush() def rmtree(self, path): shutil.rmtree(path, ignore_errors=True) diff --git a/json5/lib.py b/json5/lib.py index e5ef7e8..d5c1a94 100644 --- a/json5/lib.py +++ b/json5/lib.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import enum import math import re from typing import ( @@ -27,14 +28,52 @@ ) import unicodedata -from .parser import Parser +from json5.parser import Parser + + +# Used when encoding keys, below. +_reserved_word_re: Optional[re.Pattern] = None + + +class QuoteStyle(enum.Enum): + """Controls how strings will be quoted during encoding. + + By default, for compatibility with the `json` module and older versions of + `json5`, strings (other than legal identifiers being used as keys) + will always be double-quoted, and any double quotes in the string will be + escaped. This is `QuoteStyle.ALWAYS_DOUBLE`. If you pass + `QuoteStyle.ALWAYS_SINGLE`, then strings will always be single-quoted, and + any single quotes in the string will be escaped. If you pass + `QuoteStyle.PREFER_DOUBLE`, then the behavior is the same as ALWAYS_DOUBLE + and strings will be double-quoted *unless* the string contains more double + quotes than single quotes, in which case the string will be single-quoted + and single quotes will be escaped. If you pass `QuoteStyle.PREFER_SINGLE`, + then the behavior is the same as ALWAYS_SINGLE and strings will be + single-quoted *unless* the string contains more single quotes than double + quotes, in which case the string will be double-quoted and any double + quotes will be escaped. + + *Note:* PREFER_DOUBLE and PREFER_SINGLE can impact performance, since in + order to know which encoding to use you have to iterate over the entire + string to count the number of single and double quotes. The code guesses + at an encoding while doing so, but if it guesses wrong, the entire string has + to be re-encoded, which will slow things down. If you are very concerned + about performance (a) you probably shouldn't be using this library in the + first place, because it just isn't very fast, and (b) you should use + ALWAYS_DOUBLE or ALWAYS_SINGLE, which won't have this issue.
+ """ + + ALWAYS_DOUBLE = 'always_double' + ALWAYS_SINGLE = 'always_single' + PREFER_DOUBLE = 'prefer_double' + PREFER_SINGLE = 'prefer_single' def load( fp: IO, *, encoding: Optional[str] = None, - cls: None = None, + cls: Any = None, object_hook: Optional[Callable[[Mapping[str, Any]], Any]] = None, parse_float: Optional[Callable[[str], Any]] = None, parse_int: Optional[Callable[[str], Any]] = None, @@ -75,7 +114,7 @@ def loads( s: str, *, encoding: Optional[str] = None, - cls: None = None, + cls: Any = None, object_hook: Optional[Callable[[Mapping[str, Any]], Any]] = None, parse_float: Optional[Callable[[str], Any]] = None, parse_int: Optional[Callable[[str], Any]] = None, @@ -192,7 +231,7 @@ def dump( ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, - cls: None = None, + cls: Optional['JSON5Encoder'] = None, indent: Optional[Union[int, str]] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable[[Any], Any]] = None, @@ -200,45 +239,19 @@ def dump( quote_keys: bool = False, trailing_commas: bool = True, allow_duplicate_keys: bool = True, - **kwargs, + quote_style: QuoteStyle = QuoteStyle.ALWAYS_DOUBLE, + **kw, ): """Serialize ``obj`` to a JSON5-formatted stream to ``fp``, a ``.write()``-supporting file-like object. - Supports the same arguments as ``json.dump()``, except that: - - - The ``cls`` keyword is not supported. - - The ``encoding`` keyword is ignored; Unicode strings are always - written. - - By default, object keys that are legal identifiers are not quoted; - if you pass ``quote_keys=True``, they will be. - - By default, if lists and objects span multiple lines of output (i.e., - when ``indent`` >=0), the last item will have a trailing comma - after it. If you pass ``trailing_commas=False``, it will not. - - If you use a number, a boolean, or ``None`` as a key value in a dict, - it will be converted to the corresponding JSON string value, e.g. - "1", "true", or "null". 
By default, ``dump()`` will match the `json` - modules behavior and produce malformed JSON if you mix keys of - different types that have the same converted value; e.g., - ``{1: "foo", "1": "bar"}`` produces '{"1": "foo", "1": "bar"}', an - object with duplicated keys. If you pass - ``allow_duplicate_keys=False``, an exception will be raised instead. - - If `quote_keys` is true, then keys of objects will be enclosed in - quotes, as in regular JSON. Otherwise, keys will not be enclosed in - quotes unless they contain whitespace. - - If `trailing_commas` is false, then commas will not be inserted after - the final elements of objects and arrays, as in regular JSON. - Otherwise, such commas will be inserted. - - If `allow_duplicate_keys` is false, then only the last entry with a - given key will be written. Otherwise, all entries with the same key - will be written. + Supports the same arguments as ``dumps()``, below. Calling ``dump(obj, fp, quote_keys=True, trailing_commas=False, \ allow_duplicate_keys=True)`` should produce exactly the same output as ``json.dump(obj, fp).`` """ - del kwargs fp.write( dumps( obj=obj, @@ -254,6 +267,8 @@ def dump( quote_keys=quote_keys, trailing_commas=trailing_commas, allow_duplicate_keys=allow_duplicate_keys, + quote_style=quote_style, + **kw, ) ) @@ -265,7 +280,7 @@ def dumps( ensure_ascii: bool = True, check_circular: bool = True, allow_nan: bool = True, - cls: None = None, + cls: Optional['JSON5Encoder'] = None, indent: Optional[Union[int, str]] = None, separators: Optional[Tuple[str, str]] = None, default: Optional[Callable[[Any], Any]] = None, @@ -273,520 +288,559 @@ def dumps( quote_keys: bool = False, trailing_commas: bool = True, allow_duplicate_keys: bool = True, - **kwargs, + quote_style: QuoteStyle = QuoteStyle.ALWAYS_DOUBLE, + **kw, ): """Serialize ``obj`` to a JSON5-formatted string. Supports the same arguments as ``json.dumps()``, except that: - - The ``cls`` keyword is not supported. 
- - The ``encoding`` keyword is ignored; Unicode strings are always - written. - - By default, object keys that are legal identifiers are not quoted; - if you pass ``quote_keys=True``, they will be. + - The ``encoding`` keyword is ignored; Unicode strings are always written. + - By default, object keys that are legal identifiers are not quoted; if you + pass ``quote_keys=True``, they will be. - By default, if lists and objects span multiple lines of output (i.e., - when ``indent`` >=0), the last item will have a trailing comma - after it. If you pass ``trailing_commas=False``, it will not. - - If you use a number, a boolean, or ``None`` as a key value in a dict, - it will be converted to the corresponding JSON string value, e.g. - "1", "true", or "null". By default, ``dump()`` will match the `json` - modules behavior and produce malformed JSON if you mix keys of - different types that have the same converted value; e.g., - ``{1: "foo", "1": "bar"}`` produces '{"1": "foo", "1": "bar"}', an - object with duplicated keys. If you pass - ``allow_duplicate_keys=False``, an exception will be raised instead. - - If `quote_keys` is true, then keys of objects will be enclosed - in quotes, as in regular JSON. Otheriwse, keys will not be enclosed - in quotes unless they contain whitespace. - - If `trailing_commas` is false, then commas will not be inserted after - the final elements of objects and arrays, as in regular JSON. - Otherwise, such commas will be inserted. - - If `allow_duplicate_keys` is false, then only the last entry with a - given key will be written. Otherwise, all entries with the same key - will be written. - - Calling ``dumps(obj, quote_keys=True, trailing_commas=False, \ - allow_duplicate_keys=True)`` + when ``indent`` >=0), the last item will have a trailing comma after it. + If you pass ``trailing_commas=False``, it will not. 
+ - If you use a number, a boolean, or ``None`` as a key value in a dict, it + will be converted to the corresponding JSON string value, e.g. "1", + "true", or "null". By default, ``dump()`` will match the `json` module's + behavior and produce malformed JSON if you mix keys of different types + that have the same converted value; e.g., ``{1: "foo", "1": "bar"}`` + produces '{"1": "foo", "1": "bar"}', an object with duplicated keys. If + you pass ``allow_duplicate_keys=False``, an exception will be raised + instead. + - If `quote_keys` is true, then keys of objects will be enclosed in quotes, + as in regular JSON. Otherwise, keys that are legal identifiers (and not + reserved words) will not be enclosed in quotes. + - If `trailing_commas` is false, then commas will not be inserted after the + final elements of objects and arrays, as in regular JSON. Otherwise, + such commas will be inserted. + - If `allow_duplicate_keys` is false, then a `ValueError` will be raised + if two entries encode to the same key. Otherwise, all entries with the + same key will be written. + - `quote_style` controls how strings are encoded. See the documentation + for the `QuoteStyle` class, above, for how this is used. + + *Note*: Strings that are being used as unquoted keys are not affected + by this parameter and remain unquoted. + + *`quote_style` was added in version 0.10.0*. + + Other keyword arguments are allowed and will be passed to the + encoder so custom encoders can get them, but otherwise they will + be ignored in an attempt to provide some amount of forward-compatibility. + + *Note:* the standard JSON module explicitly calls `int.__repr__(obj)` + and `float.__repr__(obj)` to encode ints and floats, thereby bypassing + any custom representations you might have for objects that are subclasses + of ints and floats, and, for compatibility, JSON5 does the same thing. + To override this behavior, create a subclass of JSON5Encoder + that overrides `encode()` and handles your custom representation.
+ + For example: + + ``` + >>> import json5 + ... from typing import Any, Set + ... + ... class Hex(int): + ... def __repr__(self): + ... return hex(self) + ... + ... class CustomEncoder(json5.JSON5Encoder): + ... def encode( + ... self, obj: Any, seen: Set, level: int, *, as_key: bool + ... ) -> str: + ... if isinstance(obj, Hex): + ... return repr(obj) + ... return super().encode(obj, seen, level, as_key=as_key) + ... + ... print(json5.dumps([20, Hex(20)], cls=CustomEncoder)) + ... + [20, 0x14] + >>> + ``` + + *Note:* calling ``dumps(obj, quote_keys=True, trailing_commas=False, \ + allow_duplicate_keys=True)`` should produce exactly the same output as ``json.dumps(obj).`` """ - assert kwargs.get('cls', None) is None, 'Custom encoders are not supported' - del cls + # TODO: Without these pragmas, mypy will complain with: + # error: Incompatible types in assignment (expression has type + # "JSON5Encoder | type[JSON5Encoder]", variable has type + # "JSON5Encoder | None") [assignment] + # error: "JSON5Encoder" not callable [operator] + # error: "None" not callable [misc] + # As best I can tell, I think these are bugs in mypy's type inference. + # I should either file bugs against mypy or find some way to not need + # these pragmas and the assert. 
+ + cls = cls or JSON5Encoder # type: ignore[assignment] + assert cls is not None + enc = cls( + skipkeys=skipkeys, + ensure_ascii=ensure_ascii, + check_circular=check_circular, + allow_nan=allow_nan, + indent=indent, + separators=separators, + default=default, + sort_keys=sort_keys, + quote_keys=quote_keys, + trailing_commas=trailing_commas, + allow_duplicate_keys=allow_duplicate_keys, + quote_style=quote_style, + **kw, + ) # type: ignore[operator] + return enc.encode(obj, seen=set(), level=0, as_key=False) + + +class JSON5Encoder: + def __init__( + self, + *, + skipkeys: bool = False, + ensure_ascii: bool = True, + check_circular: bool = True, + allow_nan: bool = True, + indent: Optional[Union[int, str]] = None, + separators: Optional[Tuple[str, str]] = None, + default: Optional[Callable[[Any], Any]] = None, + sort_keys: bool = False, + quote_keys: bool = False, + trailing_commas: bool = True, + allow_duplicate_keys: bool = True, + quote_style: QuoteStyle = QuoteStyle.ALWAYS_DOUBLE, + **kw, + ): + """Provides a class that may be overridden to customize the behavior + of `dumps()`. The keyword args are the same as for that function. + *Added in version 0.10.0""" + # Ignore unrecognized keyword arguments in the hope of providing + # some level of backwards- and forwards-compatibility. 
+ del kw + + self.skipkeys = skipkeys + self.ensure_ascii = ensure_ascii + self.check_circular = check_circular + self.allow_nan = allow_nan + self.indent = indent + self.separators = separators + if separators is None: + separators = (', ', ': ') if indent is None else (',', ': ') + self.item_separator, self.kv_separator = separators + self.default_fn = default or _raise_type_error + self.sort_keys = sort_keys + self.quote_keys = quote_keys + self.trailing_commas = trailing_commas + self.allow_duplicate_keys = allow_duplicate_keys + self.quote_style = quote_style + + def default(self, obj: Any) -> Any: + """Provides a last-ditch option to encode a value that the encoder + doesn't otherwise recognize, by converting `obj` to a value that + *can* (and will) be serialized by the other methods in the class. + + Note: this must not return a serialized value (i.e., string) + directly, as that'll result in a doubly-encoded value.""" + return self.default_fn(obj) + + def encode( + self, + obj: Any, + seen: Set, + level: int, + *, + as_key: bool, + ) -> str: + """Returns a JSON5-encoded version of an arbitrary object. This can + be used to provide customized serialization of objects. Overridden + methods of this class should handle their custom objects and then + fall back to super().encode() if they've been passed a normal object. + + `seen` is used for duplicate object tracking when `check_circular` + is True. + + `level` represents the current indentation level, which increases + by one for each recursive invocation of encode (i.e., whenever + we're encoding the values of a dict or a list). + + May raise `TypeError` if the object is the wrong type to be + encoded (i.e., your custom routine can't handle it either), and + `ValueError` if there's something wrong with the value, e.g. + a float value of NaN when `allow_nan` is false.
+ + If `as_key` is true, the return value should be a double-quoted string + representation of the object, unless obj is a string that can be an + identifier (and quote_keys is false and obj isn't a reserved word). + If the object should not be used as a key, `TypeError` should be + raised; that allows the base implementation to implement `skipkeys` + properly. + """ + seen = seen or set() + s = self._encode_basic_type(obj, as_key=as_key) + if s is not None: + return s - if separators is None: - if indent is None: - separators = (', ', ': ') - else: - separators = (',', ': ') + if as_key: + raise TypeError(f'Invalid key {obj}') + return self._encode_non_basic_type(obj, seen, level) - default = default or _raise_type_error + def _encode_basic_type(self, obj: Any, *, as_key: bool) -> Optional[str]: + """Returns None if the object is not a basic type.""" - if check_circular: - seen: Optional[Set[int]] = set() - else: - seen = None - - level = 1 - is_key = False - - _, v = _dumps( - obj, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - is_key, - ) - return v - - -def _dumps( - obj, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen: Optional[Set[int]], - level: int, - is_key: bool, -): - # pylint: disable=too-many-statements - if obj is True: - s = 'true' - elif obj is False: - s = 'false' - elif obj is None: - s = 'null' - elif obj == float('inf'): - if allow_nan: + if isinstance(obj, str): + return self._encode_str(obj, as_key=as_key) + + # Check for True/False before ints because True and False are + # also considered ints and so would be represented as 1 and 0 + # if we did ints first.
+ if obj is True: + return '"true"' if as_key else 'true' + if obj is False: + return '"false"' if as_key else 'false' + if obj is None: + return '"null"' if as_key else 'null' + + if isinstance(obj, int): + return self._encode_int(obj, as_key=as_key) + + if isinstance(obj, float): + return self._encode_float(obj, as_key=as_key) + + return None + + def _encode_int(self, obj: int, *, as_key: bool) -> str: + s = int.__repr__(obj) + return f'"{s}"' if as_key else s + + def _encode_float(self, obj: float, *, as_key: bool) -> str: + if obj == float('inf'): + allowed = self.allow_nan s = 'Infinity' - else: - raise ValueError() - elif obj == float('-inf'): - if allow_nan: + elif obj == float('-inf'): + allowed = self.allow_nan s = '-Infinity' - else: - raise ValueError() - elif isinstance(obj, float) and math.isnan(obj): - if allow_nan: + elif math.isnan(obj): + allowed = self.allow_nan s = 'NaN' else: - raise ValueError() - elif isinstance(obj, str): + allowed = True + s = float.__repr__(obj) + + if not allowed: + raise ValueError(f'Illegal JSON5 value: {obj}') + return f'"{s}"' if as_key else s + + def _encode_str(self, obj: str, *, as_key: bool) -> str: if ( - is_key - and _is_ident(obj) - and not quote_keys - and not _is_reserved_word(obj) + as_key + and self.is_identifier(obj) + and not self.quote_keys + and not self.is_reserved_word(obj) ): - return True, obj - return True, _dump_str(obj, ensure_ascii) - elif isinstance(obj, int): - # Subclasses of `int` and `float` may have custom - # __repr__ or __str__ methods, but the `JSON` library - # ignores them in order to ensure that the representation - # are just bare numbers. In order to match JSON's behavior - # we call the methods of the `float` and `int` class directly.
- s = int.__repr__(obj) - elif isinstance(obj, float): - # See comment above for int - s = float.__repr__(obj) - else: - s = None - - if is_key: - if s is not None: - return True, f'"{s}"' - if skipkeys: - return False, None - raise TypeError(f'invalid key {repr(obj)}') - - if s is not None: - return True, s - - if indent is not None: - end_str = '' - if trailing_commas: - end_str = ',' - if isinstance(indent, int): - if indent > 0: - indent_str = '\n' + ' ' * indent * level - end_str += '\n' + ' ' * indent * (level - 1) + return obj + + return self._encode_quoted_str(obj, self.quote_style) + + def _encode_quoted_str(self, obj: str, quote_style: QuoteStyle) -> str: + """Returns a quoted string with a minimal number of escaped quotes.""" + ret = [] + double_quotes_seen = 0 + single_quotes_seen = 0 + sq = "'" + dq = '"' + for ch in obj: + if ch == dq: + # At first we will guess at which quotes to escape. If + # we guess wrong, we reencode the string below. + double_quotes_seen += 1 + if quote_style in ( + QuoteStyle.ALWAYS_DOUBLE, + QuoteStyle.PREFER_DOUBLE, + ): + encoded_ch = self._escape_ch(dq) + else: + encoded_ch = dq + elif ch == sq: + single_quotes_seen += 1 + if quote_style in ( + QuoteStyle.ALWAYS_SINGLE, + QuoteStyle.PREFER_SINGLE, + ): + encoded_ch = self._escape_ch(sq) + else: + encoded_ch = sq + elif ch == '\\': + encoded_ch = self._escape_ch(ch) else: - indent_str = '\n' - end_str += '\n' + o = ord(ch) + if o < 32: + encoded_ch = self._escape_ch(ch) + elif o < 128: + encoded_ch = ch + elif not self.ensure_ascii and ch not in ('\u2028', '\u2029'): + encoded_ch = ch + else: + encoded_ch = self._escape_ch(ch) + ret.append(encoded_ch) + + # We may have guessed wrong and need to reencode the string. 
+ if ( + double_quotes_seen > single_quotes_seen + and quote_style == QuoteStyle.PREFER_DOUBLE + ): + return self._encode_quoted_str(obj, QuoteStyle.ALWAYS_SINGLE) + if ( + single_quotes_seen > double_quotes_seen + and quote_style == QuoteStyle.PREFER_SINGLE + ): + return self._encode_quoted_str(obj, QuoteStyle.ALWAYS_DOUBLE) + + if quote_style in (QuoteStyle.ALWAYS_DOUBLE, QuoteStyle.PREFER_DOUBLE): + return '"' + ''.join(ret) + '"' + return "'" + ''.join(ret) + "'" + + def _escape_ch(self, ch: str) -> str: + """Returns the backslash-escaped representation of the char.""" + if ch == '\\': + return '\\\\' + if ch == "'": + return r'\'' + if ch == '"': + return r'\"' + if ch == '\n': + return r'\n' + if ch == '\r': + return r'\r' + if ch == '\t': + return r'\t' + if ch == '\b': + return r'\b' + if ch == '\f': + return r'\f' + if ch == '\v': + return r'\v' + if ch == '\0': + return r'\0' + + o = ord(ch) + if o < 65536: + return rf'\u{o:04x}' + + val = o - 0x10000 + high = 0xD800 + (val >> 10) + low = 0xDC00 + (val & 0x3FF) + return rf'\u{high:04x}\u{low:04x}' + + def _encode_non_basic_type(self, obj, seen: Set, level: int) -> str: + # Basic types can't be recursive so we only check for circularity + # on non-basic types. If for some reason the caller was using a + # subclass of a basic type and wanted to check circularity on it, + # it'd have to do so directly in a subclass of JSON5Encoder. + if self.check_circular: + i = id(obj) + if i in seen: + raise ValueError('Circular reference detected.') + seen.add(i) + + # Ideally we'd use collections.abc.Mapping and collections.abc.Sequence + # here, but for backwards-compatibility with potential old callers, + # we only check for the two attributes we need in each case. 
+ if hasattr(obj, 'keys') and hasattr(obj, '__getitem__'): + s = self._encode_dict(obj, seen, level + 1) + elif hasattr(obj, '__getitem__') and hasattr(obj, '__iter__'): + s = self._encode_array(obj, seen, level + 1) else: - indent_str = '\n' + indent * level - end_str += '\n' + indent * (level - 1) - else: - indent_str = '' - end_str = '' - - item_sep, kv_sep = separators - item_sep += indent_str - - if seen is not None: - i = id(obj) - if i in seen: - raise ValueError('Circular reference detected.') - seen.add(i) - - # Ideally we'd use collections.abc.Mapping and collections.abc.Sequence - # here, but for backwards-compatibility with potential old callers, - # we only check for the two attributes we need in each case. - if hasattr(obj, 'keys') and hasattr(obj, '__getitem__'): - s = _dump_dict( - obj, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level + 1, - item_sep, - kv_sep, - indent_str, - end_str, - ) - elif hasattr(obj, '__getitem__') and hasattr(obj, '__iter__'): - s = _dump_array( - obj, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level + 1, - item_sep, - indent_str, - end_str, - ) - else: - s = _dumps( - default(obj), - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - is_key, - )[1] - - if seen is not None: - seen.remove(i) - return False, s - - -def _dump_dict( - obj, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - item_sep, - kv_sep, - indent_str, - end_str, -): - if not obj: - return '{}' + s = self.encode(self.default(obj), seen, level 
+ 1, as_key=False) + assert s is not None - if sort_keys: - keys = sorted(obj.keys()) - else: - keys = obj.keys() - - s = '{' + indent_str - - num_items_added = 0 - new_keys = set() - for key in keys: - valid_key, key_str = _dumps( - key, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - is_key=True, - ) + if self.check_circular: + seen.remove(i) + return s - if skipkeys and not valid_key: - continue - - if not allow_duplicate_keys: - if key_str in new_keys: - raise ValueError(f'duplicate key {repr(key)}') - new_keys.add(key_str) - - if num_items_added: - s += item_sep - - s += ( - key_str - + kv_sep - + _dumps( - obj[key], - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - is_key=False, - )[1] - ) - num_items_added += 1 - - s += end_str + '}' - return s - - -def _dump_array( - obj, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - item_sep, - indent_str, - end_str, -): - if not obj: - return '[]' - return ( - '[' - + indent_str - + item_sep.join( - [ - _dumps( - el, - skipkeys, - ensure_ascii, - check_circular, - allow_nan, - indent, - separators, - default, - sort_keys, - quote_keys, - trailing_commas, - allow_duplicate_keys, - seen, - level, - False, - )[1] - for el in obj - ] - ) - + end_str - + ']' - ) + def _encode_dict(self, obj: Any, seen: set, level: int) -> str: + if not obj: + return '{}' + indent_str, end_str = self._spacers(level) + item_sep = self.item_separator + indent_str + kv_sep = self.kv_separator -def _dump_str(obj, ensure_ascii): - ret = ['"'] - for ch in obj: - if ch == '\\': - ret.append('\\\\') - elif ch == '"': - ret.append('\\"') - elif ch == 
'\u2028': - ret.append('\\u2028') - elif ch == '\u2029': - ret.append('\\u2029') - elif ch == '\n': - ret.append('\\n') - elif ch == '\r': - ret.append('\\r') - elif ch == '\b': - ret.append('\\b') - elif ch == '\f': - ret.append('\\f') - elif ch == '\t': - ret.append('\\t') - elif ch == '\v': - ret.append('\\v') - elif ch == '\0': - ret.append('\\0') - elif not ensure_ascii: - ret.append(ch) + if self.sort_keys: + keys = sorted(obj.keys()) else: - o = ord(ch) - if 32 <= o < 128: - ret.append(ch) - elif o < 65536: - ret.append(f'\\u{o:04x}') + keys = obj.keys() + + s = '{' + indent_str + + first_key = True + new_keys = set() + for key in keys: + try: + key_str = self.encode(key, seen, level, as_key=True) + except TypeError: + if self.skipkeys: + continue + raise + + if not self.allow_duplicate_keys: + if key_str in new_keys: + raise ValueError(f'duplicate key {repr(key)}') + new_keys.add(key_str) + + if first_key: + first_key = False else: - val = o - 0x10000 - high = 0xD800 + (val >> 10) - low = 0xDC00 + (val & 0x3FF) - ret.append(f'\\u{high:04x}\\u{low:04x}') - return ''.join(ret) + '"' + s += item_sep + val_str = self.encode(obj[key], seen, level, as_key=False) + s += key_str + kv_sep + val_str -def _is_ident(k): - if not k or not _is_id_start(k[0]) and k[0] not in ('$', '_'): - return False - for ch in k[1:]: - if not _is_id_continue(ch) and ch not in ('$', '_'): - return False - return True - - -def _is_id_start(ch): - return unicodedata.category(ch) in ( - 'Lu', - 'Ll', - 'Li', - 'Lt', - 'Lm', - 'Lo', - 'Nl', - ) + s += end_str + '}' + return s + def _encode_array(self, obj: Any, seen: Set, level: int) -> str: + if not obj: + return '[]' -def _is_id_continue(ch): - return unicodedata.category(ch) in ( - 'Lu', - 'Ll', - 'Li', - 'Lt', - 'Lm', - 'Lo', - 'Nl', - 'Nd', - 'Mn', - 'Mc', - 'Pc', - ) + indent_str, end_str = self._spacers(level) + item_sep = self.item_separator + indent_str + return ( + '[' + + indent_str + + item_sep.join( + self.encode(el, seen, 
level, as_key=False) for el in obj + ) + + end_str + + ']' + ) + + def _spacers(self, level: int) -> Tuple[str, str]: + if self.indent is not None: + end_str = '' + if self.trailing_commas: + end_str = ',' + if isinstance(self.indent, int): + if self.indent > 0: + indent_str = '\n' + ' ' * self.indent * level + end_str += '\n' + ' ' * self.indent * (level - 1) + else: + indent_str = '\n' + end_str += '\n' + else: + indent_str = '\n' + self.indent * level + end_str += '\n' + self.indent * (level - 1) + else: + indent_str = '' + end_str = '' + return indent_str, end_str + + def is_identifier(self, key: str) -> bool: + """Returns whether the string could be used as a legal + EcmaScript/JavaScript identifier. + + There should normally be no reason to override this, unless + the definition of identifiers change in later versions of the + JSON5 spec and this implementation hasn't been updated to handle + the changes yet.""" + if ( + not key + or not self._is_id_start(key[0]) + and key[0] not in ('$', '_') + ): + return False + for ch in key[1:]: + if not self._is_id_continue(ch) and ch not in ('$', '_'): + return False + return True + def _is_id_start(self, ch: str) -> bool: + return unicodedata.category(ch) in ( + 'Lu', + 'Ll', + 'Li', + 'Lt', + 'Lm', + 'Lo', + 'Nl', + ) -_reserved_word_re = None - - -def _is_reserved_word(k): - global _reserved_word_re - - if _reserved_word_re is None: - # List taken from section 7.6.1 of ECMA-262. 
- _reserved_word_re = re.compile( - '(' - + '|'.join( - [ - 'break', - 'case', - 'catch', - 'class', - 'const', - 'continue', - 'debugger', - 'default', - 'delete', - 'do', - 'else', - 'enum', - 'export', - 'extends', - 'false', - 'finally', - 'for', - 'function', - 'if', - 'import', - 'in', - 'instanceof', - 'new', - 'null', - 'return', - 'super', - 'switch', - 'this', - 'throw', - 'true', - 'try', - 'typeof', - 'var', - 'void', - 'while', - 'with', - ] - ) - + ')$' + def _is_id_continue(self, ch: str) -> bool: + return unicodedata.category(ch) in ( + 'Lu', + 'Ll', + 'Li', + 'Lt', + 'Lm', + 'Lo', + 'Nl', + 'Nd', + 'Mn', + 'Mc', + 'Pc', ) - return _reserved_word_re.match(k) is not None + + def is_reserved_word(self, key: str) -> bool: + """Returns whether the key is a reserved word. + + There should normally be no need to override this, unless there + have been reserved words added in later versions of the JSON5 + spec and this implementation has not yet been updated to handle + the changes yet.""" + global _reserved_word_re + if _reserved_word_re is None: + # List taken from section 7.6.1 of ECMA-262, version 5.1. + # https://262.ecma-international.org/5.1/#sec-7.6.1. + # This includes currently reserved words, words reserved + # for future use (both as of 5.1), null, true, and false. 
+ _reserved_word_re = re.compile( + '(' + + '|'.join( + [ + 'break', + 'case', + 'catch', + 'class', + 'const', + 'continue', + 'debugger', + 'default', + 'delete', + 'do', + 'else', + 'enum', + 'export', + 'extends', + 'false', + 'finally', + 'for', + 'function', + 'if', + 'import', + 'in', + 'instanceof', + 'new', + 'null', + 'return', + 'super', + 'switch', + 'this', + 'throw', + 'true', + 'try', + 'typeof', + 'var', + 'void', + 'while', + 'with', + ] + ) + + ')$' + ) + return _reserved_word_re.match(key) is not None -def _raise_type_error(obj): +def _raise_type_error(obj) -> Any: raise TypeError(f'{repr(obj)} is not JSON5 serializable') diff --git a/json5/parser.py b/json5/parser.py index 581d4a7..952e74e 100644 --- a/json5/parser.py +++ b/json5/parser.py @@ -1,4 +1,4 @@ -# Generated by glop version 0.8.2 +# Generated by glop version 0.8.3 # https://github.com/dpranke/glop # `glop -o json5/parser.py --no-main --no-memoize -c json5/json5.g` @@ -95,7 +95,7 @@ def _plus(self, rule): def _star(self, rule, vs=None): vs = vs or [] - while not self.failed: + while True: p = self.pos rule() if self.failed: diff --git a/json5/tool.py b/json5/tool.py index 332fa05..c282964 100644 --- a/json5/tool.py +++ b/json5/tool.py @@ -16,33 +16,109 @@ Usage: - $ echo '{foo:"bar"}' | python -m json5.tool + $ echo '{foo:"bar"}' | python -m json5 { foo: 'bar', } - $ echo '{foo:"bar"}' | python -m json5.tool --as-json + $ echo '{foo:"bar"}' | python -m json5 --as-json { "foo": "bar" } """ +import argparse import sys -from . import arg_parser -from . 
import lib -from .host import Host -from .version import __version__ +import json5 +from json5.host import Host +from json5.version import __version__ + +QUOTE_STYLES = {q.value: q for q in json5.QuoteStyle} def main(argv=None, host=None): host = host or Host() - parser = arg_parser.ArgumentParser(host, prog='json5', desc=__doc__) + args = _parse_args(host, argv) + + if args.version: + host.print(__version__) + return 0 + + if args.cmd: + inp = args.cmd + elif args.file == '-': + inp = host.stdin.read() + else: + inp = host.read_text_file(args.file) + + if args.indent == 'None': + args.indent = None + else: + try: + args.indent = int(args.indent) + except ValueError: + pass + + if args.as_json: + args.quote_keys = True + args.trailing_commas = False + args.quote_style = json5.QuoteStyle.ALWAYS_DOUBLE.value + + obj = json5.loads(inp, strict=args.strict) + s = json5.dumps( + obj, + indent=args.indent, + quote_keys=args.quote_keys, + trailing_commas=args.trailing_commas, + quote_style=QUOTE_STYLES[args.quote_style], + ) + host.print(s) + return 0 + + +class _HostedArgumentParser(argparse.ArgumentParser): + """An argument parser that plays nicely w/ host objects.""" + + def __init__(self, host, **kwargs): + self.host = host + super().__init__(**kwargs) + + def exit(self, status=0, message=None): + if message: + self._print_message(message, self.host.stderr) + sys.exit(status) + + def error(self, message): + self.host.print(f'usage: {self.usage}', end='', file=self.host.stderr) + self.host.print(' -h/--help for help\n', file=self.host.stderr) + self.exit(2, f'error: {message}\n') + + def print_help(self, file=None): + self.host.print(self.format_help(), file=file) + + +def _parse_args(host, argv): + usage = 'json5 [options] [FILE]\n' + + parser = _HostedArgumentParser( + host, + prog='json5', + usage=usage, + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + '-V', + '--version', + action='store_true', + help=f'show 
JSON5 library version ({__version__})', + ) parser.add_argument( '-c', metavar='STR', dest='cmd', - help='inline json5 string to read instead of ' 'reading from a file', + help='inline json5 string to read instead of reading from a file', ) parser.add_argument( '--as-json', @@ -50,13 +126,13 @@ def main(argv=None, host=None): action='store_const', const=True, default=False, - help='output as JSON ' '(same as --quote-keys --no-trailing-commas)', + help='output as JSON (same as --quote-keys --no-trailing-commas)', ) parser.add_argument( '--indent', dest='indent', default=4, - help='amount to indent each line ' '(default is 4 spaces)', + help='amount to indent each line (default is 4 spaces)', ) parser.add_argument( '--quote-keys', @@ -89,13 +165,22 @@ def main(argv=None, host=None): '--strict', action='store_true', default=True, - help='Do not allow control characters (\x00-\x1f) in strings (default)', + help='Do not allow control characters (\\x00-\\x1f) in strings ' + '(default)', ) parser.add_argument( '--no-strict', dest='strict', action='store_false', - help='Allow control characters (\x00-\x1f) in strings', + help='Allow control characters (\\x00-\\x1f) in strings', + ) + parser.add_argument( + '--quote-style', + action='store', + default='always_double', + choices=QUOTE_STYLES.keys(), + help='Controls how strings are encoded. 
By default they are always ' + 'double-quoted ("always_double")', ) parser.add_argument( 'file', @@ -106,43 +191,7 @@ def main(argv=None, host=None): 'not specified or "-", will read from stdin ' 'instead', ) - args = parser.parse_args(argv) - - if parser.exit_status is not None: - return parser.exit_status - - if args.version: - host.print_(__version__) - return 0 - - if args.cmd: - inp = args.cmd - elif args.file == '-': - inp = host.stdin.read() - else: - inp = host.read_text_file(args.file) - - if args.indent == 'None': - args.indent = None - else: - try: - args.indent = int(args.indent) - except ValueError: - pass - - if args.as_json: - args.quote_keys = True - args.trailing_commas = False - - obj = lib.loads(inp, strict=args.strict) - s = lib.dumps( - obj, - indent=args.indent, - quote_keys=args.quote_keys, - trailing_commas=args.trailing_commas, - ) - host.print_(s) - return 0 + return parser.parse_args(argv) if __name__ == '__main__': # pragma: no cover diff --git a/json5/version.py b/json5/version.py index ebf2568..db9bf44 100644 --- a/json5/version.py +++ b/json5/version.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = '0.9.29.dev0' +__version__ = '0.10.0' # For backward-compatibility with earlier versions of json5: VERSION = __version__ diff --git a/pylintrc b/pylintrc index 94776e9..7133aaa 100644 --- a/pylintrc +++ b/pylintrc @@ -26,6 +26,7 @@ persistent=yes disable= broad-except, + fixme, global-statement, locally-disabled, missing-docstring, diff --git a/tests/host_fake.py b/tests/host_fake.py index 8012619..539022a 100644 --- a/tests/host_fake.py +++ b/tests/host_fake.py @@ -84,16 +84,18 @@ def maybe_mkdir(self, *comps): # pragma: no cover def mkdtemp(self, suffix='', prefix='tmp', dir=None, **_kwargs): if dir is None: dir = self.sep + '__im_tmp' + else: # pragma: no cover + pass curno = self.current_tmpno self.current_tmpno += 1 self.last_tmpdir = self.join(dir, f'{prefix}_{curno}_{suffix}') self.dirs.add(self.last_tmpdir) return self.last_tmpdir - def print_(self, msg='', end='\n', stream=None): - stream = stream or self.stdout - stream.write(msg + end) - stream.flush() + def print(self, msg='', end='\n', file=None): + file = file or self.stdout + file.write(msg + end) + file.flush() def read_text_file(self, *comps): return self._read(comps) @@ -111,6 +113,8 @@ def rmtree(self, *comps): for f in self.files: if f.startswith(path): self.remove(f) + else: # pragma: no cover + pass self.dirs.remove(path) def write_text_file(self, path, contents): diff --git a/tests/host_test.py b/tests/host_test.py index 97ff218..5fe30f1 100644 --- a/tests/host_test.py +++ b/tests/host_test.py @@ -39,7 +39,7 @@ def test_directory_and_file_operations(self): def test_print(self): s = io.StringIO() h = Host() - h.print_('hello, world', stream=s) + h.print('hello, world', file=s) self.assertEqual('hello, world\n', s.getvalue()) diff --git a/tests/lib_test.py b/tests/lib_test.py index 1246ae5..6465649 100644 --- a/tests/lib_test.py +++ b/tests/lib_test.py @@ -25,7 +25,7 @@ class TestLoads(unittest.TestCase): maxDiff = None def check(self, s, obj, strict=True): - 
self.assertEqual(json5.loads(s, strict=strict), obj) + self.assertEqual(obj, json5.loads(s, strict=strict)) def check_fail(self, s, err=None): try: @@ -51,9 +51,6 @@ def test_bools(self): self.check('true', True) self.check('false', False) - def test_cls_is_not_supported(self): - self.assertRaises(AssertionError, json5.loads, '1', cls=lambda x: x) - def test_duplicate_keys_should_be_allowed(self): self.assertEqual( json5.loads('{foo: 1, foo: 2}', allow_duplicate_keys=True), @@ -334,8 +331,8 @@ def test_basic(self): class TestDumps(unittest.TestCase): maxDiff = None - def check(self, obj, s): - self.assertEqual(json5.dumps(obj), s) + def check(self, obj, s, **kwargs): + self.assertEqual(s, json5.dumps(obj, **kwargs)) def test_allow_duplicate_keys(self): self.assertIn( @@ -362,18 +359,25 @@ def test_bools(self): self.check(False, 'false') def test_check_circular(self): + # This tests that a non-cyclic object works w/ either flag value. + obj = [1, 2, 3] + self.check(obj, '[1, 2, 3]') # testing the default + self.check(obj, '[1, 2, 3]', check_circular=True) + self.check(obj, '[1, 2, 3]', check_circular=False) + # This tests a trivial cycle. obj = [1, 2, 3] obj[2] = obj self.assertRaises(ValueError, json5.dumps, obj) - # This checks that json5 doesn't raise an error. However, + # This checks that json5 doesn't raise an error with + # check_circular=false and a cycle. However, # the underlying Python implementation likely will. try: json5.dumps(obj, check_circular=False) self.fail() # pragma: no cover - except Exception as e: - self.assertNotIn(str(e), 'Circular reference detected') + except RecursionError: + pass # This checks that repeated but non-circular references # are okay. @@ -522,6 +526,13 @@ def test_numbers(self): def test_null(self): self.check(None, 'null') + def test_separators(self): + # Check that custom separators work; these separators add an + # extra space. 
+ self.check( + [{'foo': 1}, 2], '[{foo: 1}, 2]', separators=(', ', ': ') + ) + def test_objects(self): self.check({'foo': 1}, '{foo: 1}') self.check({'foo bar': 1}, '{"foo bar": 1}') @@ -549,19 +560,189 @@ def test_quote_keys(self): json5.dumps({'foo': 1}, quote_keys=True), '{"foo": 1}' ) - def test_strings(self): + def test_strings_containing_backslashes_and_quotes(self): + # Understanding how things are escaped in test of JSON5 strings + # can be tricky. + + # Normal Python escaping means that the following asserts are true: + self.assertEqual('\\z', '\\z') + self.assertEqual('\\z', r'\z') + self.assertEqual('\\z', r'\z') + + # But, in Python, escaping quotes in a raw string is tricky, because + # the escape is left in the output. The results of this are: + + # (1) You cannot use a raw string to match a value ending in a + # an odd number of backslashes: the first N-1 backslashes would + # be matched by the same number of backslashes in the raw string, + # leaving a single backslash followed by a quote. The quote + # would then be considered escaped, leaving the string unterminated. 
+ # Ending in an even number of backslashes is fine: + self.assertEqual(len(r'\\'), 2) + self.assertEqual(r'\\', '\\\\') + + # (2) You cannot use a raw string to represent a value that contains + # the same kind of quote you're using to enclose the string, unless the + # value actually contains an odd number of backslashes immediately + # preceding the quote: + self.assertEqual(len(r'\''), 2) + self.assertEqual(r'\'', "\\'") + self.assertEqual(r'\'', "\\'") + self.assertEqual(r'\'', r'\'') + + # Now, in JSON5, if the value doesn't contain backslashes, you can + # use normal quoting as you would in Python, but you can't use + # raw strings, since the raw strings would require the values to + # have backslashes in them: self.check("'single'", '"\'single\'"') - self.check('"double"', '"\\"double\\""') - self.check( - "'single \\' and double \"'", '"\'single \\\\\' and double \\"\'"' - ) + self.check("'single'", '"\'single\'"') + + # In order to represent a backslash in the value you also need to + # escape it in the JSON string: a string containing a single backslash + # is represented by "\\". So, in order to match that single backslash + # via non-raw strings in Python source code, you need to (3) double the + # backslashes (for JSON5) and then double them again, for python source + # code. I.e., you need *4* backslashes in the source code. In many + # cases you can also use single-quoted raw strings (where you have + # to (4) double the number of quotes in the output), but in this + # particular example, you cannot use single-quoted raw strings, + # due to (1). + self.check('\\', '"\\\\"') + self.check('\\', '"\\\\"') + + # You cannot use a double-quoted raw string to represent + # double-quoted JSON5 strings, since the output needs to start with a + # double quote, and you can't represent that in a raw double-quoted + # string due to (2). 
+ + # Here you see the doubling of backslashes in single-quoted + # raw output strings, and the quadrupling in a non-raw string. + self.check(r'\z', r'"\\z"') + self.check(r'\\z', r'"\\\\z"') + self.check(r'\\\z', r'"\\\\\\z"') + self.check(r'\z', '"\\\\z"') + + self.check('"', '"\\""') + self.check('"', '"\\""') + + # Here it's okay to use a raw string for output since the output + # needs to have a single backslash and doesn't end in a single quote. + self.check('"', r'"\""') + + # Here you cannot use raw strings for the output as the output + # would need to have only two backslashes in it. + self.check(r'\'', '"\\\\\'"') + self.check(r'\'', '"\\\\\'"') def test_string_escape_sequences(self): + # self.check(r'\'', '"\\\\\'"') + self.check("'\\'", '"\'\\\\\'"') self.check( '\u2028\u2029\b\t\f\n\r\v\\\0', - '"\\u2028\\u2029\\b\\t\\f\\n\\r\\v\\\\\\0"', + r'"\u2028\u2029\b\t\f\n\r\v\\\0"', ) + def test_string_quote_styles(self): + def checkp(**kwargs): + return lambda obj, s: self.assertEqual( + s, json5.dumps(obj, **kwargs) + ) + + sq = "'" + dq = '"' + + neither = 'a b c' + single_neither = sq + neither + sq + double_neither = dq + neither + dq + + single = "a 'b' c" + single_single = sq + r'a \'b\' c' + sq + double_single = dq + single + dq + + double = 'a "b" c' + single_double = sq + double + sq + double_double = dq + r'a \"b\" c' + dq + + both = 'a \'b\' "c" d' + single_both = sq + r'a \'b\' "c" d' + sq + double_both = dq + r"a 'b' \"c\" d" + dq + + reverse = 'a "b" \'c\' d' + single_reverse = sq + r'a "b" \'c\' d' + sq + double_reverse = dq + r"a \"b\" 'c' d" + dq + + more_single = "a 'b' 'c' \"d\" e" + single_more_single = sq + r'a \'b\' \'c\' "d" e' + sq + double_more_single = dq + r"a 'b' 'c' \"d\" e" + dq + + more_double = 'a "b" "c" \'d\' e' + single_more_double = sq + r'a "b" "c" \'d\' e' + sq + double_more_double = dq + r"a \"b\" \"c\" 'd' e" + dq + + more_single_double_first = "a \"b\" 'c' 'd' e" + single_more_single_double_first = sq + r'a "b" \'c\' 
\'d\' e' + sq + double_more_single_double_first = dq + r"a \"b\" 'c' 'd' e" + dq + + more_double_single_first = 'a \'b\' "c" "d" e' + single_more_double_single_first = sq + r'a \'b\' "c" "d" e' + sq + double_more_double_single_first = dq + r"a 'b' \"c\" \"d\" e" + dq + + # Default settings (should be ALWAYS_DOUBLE) + c = checkp() + c(neither, double_neither) + c(single, double_single) + c(double, double_double) + c(both, double_both) + c(reverse, double_reverse) + c(more_single, double_more_single) + c(more_double, double_more_double) + c(more_single_double_first, double_more_single_double_first) + c(more_double_single_first, double_more_double_single_first) + + c = checkp(quote_style=json5.QuoteStyle.ALWAYS_DOUBLE) + c(neither, double_neither) + c(single, double_single) + c(double, double_double) + c(both, double_both) + c(reverse, double_reverse) + c(more_single, double_more_single) + c(more_double, double_more_double) + c(more_single_double_first, double_more_single_double_first) + c(more_double_single_first, double_more_double_single_first) + + c = checkp(quote_style=json5.QuoteStyle.ALWAYS_SINGLE) + c(neither, single_neither) + c(single, single_single) + c(double, single_double) + c(both, single_both) + c(reverse, single_reverse) + c(more_single, single_more_single) + c(more_double, single_more_double) + c(more_single_double_first, single_more_single_double_first) + c(more_double_single_first, single_more_double_single_first) + + c = checkp(quote_style=json5.QuoteStyle.PREFER_DOUBLE) + c(neither, double_neither) + c(single, double_single) + c(double, single_double) + c(both, double_both) + c(reverse, double_reverse) + c(more_single, double_more_single) + c(more_double, single_more_double) + c(more_single_double_first, double_more_single_double_first) + c(more_double_single_first, single_more_double_single_first) + + c = checkp(quote_style=json5.QuoteStyle.PREFER_SINGLE) + c(neither, single_neither) + c(single, double_single) + c(double, single_double) + 
c(both, single_both) + c(reverse, single_reverse) + c(more_single, double_more_single) + c(more_double, single_more_double) + c(more_single_double_first, double_more_single_double_first) + c(more_double_single_first, single_more_double_single_first) + def test_skip_keys(self): od = OrderedDict() od[(1, 2)] = 2 diff --git a/tests/tool_test.py b/tests/tool_test.py index 33f490c..dceb860 100644 --- a/tests/tool_test.py +++ b/tests/tool_test.py @@ -12,157 +12,197 @@ # See the License for the specific language governing permissions and # limitations under the License. +import subprocess +import sys import unittest -import json5 -import json5.tool +from json5 import __version__, VERSION +from json5.host import Host +from json5.tool import main -from .host_fake import FakeHost +from tests.host_fake import FakeHost -class CheckMixin: +class ToolTest(unittest.TestCase): + maxDiff = None + def _write_files(self, host, files): for path, contents in list(files.items()): host.write_text_file(path, contents) - def check_cmd( - self, args, stdin=None, files=None, returncode=None, out=None, err=None + def check( + self, args, stdin=None, files=None, returncode=0, out=None, err=None ): - host = self._host() - orig_wd, tmpdir = None, None - try: + # We can run the tests two ways: as a full out-of-process integration + # test (launching a subprocess and checking stdin/out/err) and as + # a mocked-out in-process pseudo-integration test. + # + # The code coverage between the two is identical (excluding the + # coverage in this file, of course). The full integration tests + # are slower, and running `python -m coverage` won't account for + # the coverage in the subprocess. + # + # For greatest coverage, by default we run the tests both ways. + # TODO: If there was some convention for how to pass arguments from + # a caller of the unittest module to this code, it would be nice + # if we had command line args to toggle the two modes off and on. 
+ # Or, we could also figure out how to get the coverage in the + # subprocess correctly accounted for. + in_proc = True + out_of_proc = True + assert ( + in_proc or out_of_proc + ), 'At least one of in_proc or out_of_proc must be true' + + if in_proc: + fake_host = FakeHost() + orig_wd = fake_host.getcwd() + tmpdir = fake_host.mkdtemp() + fake_host.chdir(tmpdir) + + fake_host.write_text_file('/tmp/foo', '') + + if stdin: + fake_host.stdin.write(stdin) + fake_host.stdin.seek(0) + if files: + self._write_files(fake_host, files) + + try: + mock_ret = main(args, fake_host) + except SystemExit as e: + mock_ret = e.code + + fake_host.rmtree(tmpdir) + fake_host.chdir(orig_wd) + + mock_out = fake_host.stdout.getvalue() + mock_err = fake_host.stderr.getvalue() + + if returncode is not None: + self.assertEqual(returncode, mock_ret) + if out is not None: + self.assertMultiLineEqual(out, mock_out) + if err is not None: + self.assertMultiLineEqual(err, mock_err) + else: # pragma: no cover + pass + + if out_of_proc: + host = Host() orig_wd = host.getcwd() tmpdir = host.mkdtemp() - host.chdir(tmpdir) - if files: - self._write_files(host, files) - rv = self._call(host, args, stdin, returncode, out, err) - actual_ret, actual_out, actual_err = rv - finally: - if tmpdir: + try: + host.chdir(tmpdir) + if files: + self._write_files(host, files) + + args = [sys.executable, '-m', 'json5'] + args + with subprocess.Popen( + args, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + encoding='utf-8', + ) as proc: + actual_out, actual_err = proc.communicate(input=stdin) + actual_ret = proc.returncode + if returncode is not None: + self.assertEqual(returncode, actual_ret) + if out is not None: + self.assertMultiLineEqual(out, actual_out) + if err is not None: + self.assertMultiLineEqual(err, actual_err) + + finally: host.rmtree(tmpdir) - if orig_wd: host.chdir(orig_wd) + else: # pragma: no cover + pass - return actual_ret, actual_out, actual_err - + if in_proc: + 
return mock_ret, mock_out, mock_err -class UnitTestMixin: - def _host(self): - return FakeHost() - - def _call( - self, host, args, stdin=None, returncode=None, out=None, err=None - ): - if stdin is not None: - host.stdin.write(str(stdin)) - host.stdin.seek(0) - actual_ret = json5.tool.main(args, host) - actual_out = host.stdout.getvalue() - actual_err = host.stderr.getvalue() - if returncode is not None: - self.assertEqual(returncode, actual_ret) - if out is not None: - self.assertEqual(out, actual_out) - if err is not None: - self.assertEqual(err, actual_err) - return actual_ret, actual_out, actual_err - - -class ToolTest(UnitTestMixin, CheckMixin, unittest.TestCase): - maxDiff = None + return actual_ret, actual_out, actual_err # pragma: no cover def test_help(self): - self.check_cmd(['--help'], returncode=0) + self.check(['--help']) + + # Run again and ignore the error code just to get coverage of + # the test code branches in check(). + self.check(['--help'], returncode=None) def test_inline_expression(self): - self.check_cmd( - ['-c', '{foo: 1}'], returncode=0, out='{\n foo: 1,\n}\n' - ) + self.check(['-c', '{foo: 1}'], out='{\n foo: 1,\n}\n') def test_indent(self): - self.check_cmd( - ['--indent=None', '-c', '[1]'], returncode=0, out='[1]\n' - ) - self.check_cmd( - ['--indent=2', '-c', '[1]'], returncode=0, out='[\n 1,\n]\n' - ) - self.check_cmd( - ['--indent= ', '-c', '[1]'], returncode=0, out='[\n 1,\n]\n' - ) + self.check(['--indent=None', '-c', '[1]'], out='[1]\n') + self.check(['--indent=2', '-c', '[1]'], out='[\n 1,\n]\n') + self.check(['--indent= ', '-c', '[1]'], out='[\n 1,\n]\n') def test_as_json(self): - self.check_cmd( + self.check( ['--as-json', '-c', '{foo: 1}'], - returncode=0, out='{\n "foo": 1\n}\n', ) def test_quote_keys(self): - self.check_cmd( + self.check( ['--quote-keys', '-c', '{foo: 1}'], - returncode=0, out='{\n "foo": 1,\n}\n', ) def test_no_quote_keys(self): - self.check_cmd( + self.check( ['--no-quote-keys', '-c', '{foo: 1}'], - 
returncode=0, out='{\n foo: 1,\n}\n', ) def test_keys_are_quoted_by_default(self): - self.check_cmd( - ['-c', '{foo: 1}'], returncode=0, out='{\n foo: 1,\n}\n' - ) + self.check(['-c', '{foo: 1}'], out='{\n foo: 1,\n}\n') def test_read_command(self): - self.check_cmd(['-c', '"foo"'], returncode=0, out='"foo"\n') + self.check(['-c', '"foo"'], out='"foo"\n') def test_read_from_stdin(self): - self.check_cmd([], stdin='"foo"\n', returncode=0, out='"foo"\n') + self.check([], stdin='"foo"\n', out='"foo"\n') def test_read_from_a_file(self): files = { 'foo.json5': '"foo"\n', } - self.check_cmd(['foo.json5'], files=files, returncode=0, out='"foo"\n') + self.check(['foo.json5'], files=files, out='"foo"\n') def test_trailing_commas(self): - self.check_cmd( + self.check( ['--trailing-commas', '-c', '{foo: 1}'], - returncode=0, out='{\n foo: 1,\n}\n', ) def test_no_trailing_commas(self): - self.check_cmd( + self.check( ['--no-trailing-commas', '-c', '{foo: 1}'], - returncode=0, out='{\n foo: 1\n}\n', ) def test_trailing_commas_are_there_by_default(self): - self.check_cmd( - ['-c', '{foo: 1}'], returncode=0, out='{\n foo: 1,\n}\n' - ) + self.check(['-c', '{foo: 1}'], out='{\n foo: 1,\n}\n') def test_unknown_switch(self): - self.check_cmd( + self.check( ['--unknown-switch'], returncode=2, - err='json5: error: unrecognized arguments: ' - '--unknown-switch\n\n', + err=( + 'usage: json5 [options] [FILE]\n' + ' -h/--help for help\n' + '\n' + 'error: unrecognized arguments: --unknown-switch\n' + ), ) def test_version(self): - self.check_cmd( - ['--version'], returncode=0, out=str(json5.VERSION) + '\n' - ) - self.check_cmd( - ['--version'], returncode=0, out=str(json5.__version__) + '\n' - ) + self.check(['--version'], out=str(VERSION) + '\n') + self.check(['--version'], out=str(__version__) + '\n') if __name__ == '__main__': # pragma: no cover