Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Typed JSON API #87

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 77 additions & 1 deletion srsly/_json_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union, Iterable, Sequence, Any, Optional, Iterator
from typing import Any, Iterable, Dict, List, Optional, Iterator, Union, Type, cast
import sys
import json as _builtin_json
import gzip
Expand Down Expand Up @@ -39,6 +39,32 @@ def json_loads(data: Union[str, bytes]) -> JSONOutput:
return ujson.loads(data)


def json_loads_dict(data: Union[str, bytes]) -> Dict[str, Any]:
"""Deserialize unicode or bytes to a Python dict.

data (str / bytes): The data to deserialize.
RAISES: ValueError if the loaded data is not a dict
RETURNS: The deserialized Python dict.
"""
obj = json_loads(data)
if not isinstance(obj, dict):
raise ValueError("JSON data could not be parsed to a dict.")
return obj


def json_loads_list(data: Union[str, bytes]) -> List[Dict[str, Any]]:
Copy link
Contributor

@adrianeboyd adrianeboyd Mar 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This and read_json_list only check List not List[Dict[str, Any]]? I wouldn't expect a function that's called read_json_list to have the additional dict behavior/type?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right this part is a little weird. Again, what I want 99% of the time when calling read_json_list is to have a list of dicts. But the naming does leave something to be desired. There is some validation in place for the read_json_list function, just not for the json_loads_list. I can copy the validation over there though.

Is read_json_dicts a better name here? similar to the read_jsonl_dicts.

"""Deserialize unicode or bytes to a Python list of dicts.

data (str / bytes): The data to deserialize.
RAISES: ValueError if the loaded data is not a list
RETURNS: The deserialized Python list.
"""
loaded = json_loads(data)
if not isinstance(loaded, list):
raise ValueError("JSON data could not be parsed to a list of dicts.")
return loaded


def read_json(path: FilePath) -> JSONOutput:
"""Load JSON from file or standard input.

Expand All @@ -53,6 +79,40 @@ def read_json(path: FilePath) -> JSONOutput:
return ujson.load(f)


def read_json_dict(path: FilePath) -> Dict[str, Any]:
"""Load JSON from file or standard input.

path (FilePath): The file path. "-" for reading from stdin.
RETURNS (JSONOutput): The loaded JSON content.
"""
data = read_json(path)
if not isinstance(data, dict):
raise ValueError("Invalid JSON, data could not be parsed to a dict.")
return data


def read_json_list(path: FilePath, validate_inner: bool = False, skip_invalid: bool = False) -> List[Dict[str, Any]]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is still a bit of a typing problem because the return type depends on the value of validate_inner.

I think this method with these options and types may be getting too specific for inclusion in the srsly API.

If it is going to be in srsly, then I think it would be better as read_json_list() -> List and read_json_list_of_dicts() -> List[Dict[str, JSONOutput]] or something along those lines?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed that was weird. I fixed these types and separated this logic into 2 functions.

This parsing as a list idea is kinda extra honestly. I think the main functions I really think should live in srsly are:

  • read_json_dict -> We parse a normal JSON file expecting a JSON object (e.g. for configs and stuff) all the time
  • read_jsonl_dicts -> We use this to load data all the time and expect each line to be a valid JSON object

"""Load JSON from file or standard input.

path (FilePath): The file path. "-" for reading from stdin.
RETURNS (JSONOutput): The loaded JSON content.
"""

data = read_json(path)
err_msg = "Invalid JSON, data could not be parsed to a list of dicts."
if not isinstance(data, list):
raise ValueError(err_msg)

output = []
for i, obj in enumerate(data):
if not isinstance(obj, dict):
if skip_invalid:
continue
raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.")
output.append(obj)
return data


def read_gzip_json(path: FilePath) -> JSONOutput:
"""Load JSON from a gzipped file.

Expand Down Expand Up @@ -149,6 +209,22 @@ def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
yield line


def read_jsonl_dicts(path: FilePath, skip: bool = False) -> Iterable[Dict[str, Any]]:
"""Read a .jsonl file or standard input and yield contents line by line.
Blank lines will always be skipped. Validates the contents of each line is a dict.

path (FilePath): The file path. "-" for reading from stdin.
skip (bool): Skip broken lines and don't raise ValueError.
YIELDS (JSONOutput): The loaded JSON contents of each line.
"""
for i, line in enumerate(read_jsonl(path, skip=skip)):
if not isinstance(line, dict):
if skip:
continue
raise ValueError(f"Invalid JSON Object on line: {i + 1}. Line is not a valid dict.")
yield line


def write_jsonl(
path: FilePath,
lines: Iterable[JSONInput],
Expand Down
42 changes: 42 additions & 0 deletions srsly/tests/test_json_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@
import gzip
import numpy

from typing import Any, Dict, List, Union

from .._json_api import (
read_json,
read_json_dict,
read_json_list,
read_jsonl_dicts,
write_json,
read_jsonl,
write_jsonl,
Expand Down Expand Up @@ -262,3 +267,40 @@ def test_read_jsonl_gzip():
assert len(data[1]) == 1
assert data[0]["hello"] == "world"
assert data[1]["test"] == 123


READ_JSON_DICT_TEST_CASES = {

}


READ_JSONL_DICT_TEST_CASES = {
"invalid_str": ('"test"', ValueError()),
"invalid_num": ('-32', ValueError()),
"invalid_json_list": ('[{"hello": "world"}\n{"test": 123}]', ValueError()),
"valid_dicts": ('{"hello": "world"}\n{"test": 123}', [{"hello": "world"}, {"test": 123}]),
}

@pytest.mark.parametrize(
"file_contents, expected",
READ_JSONL_DICT_TEST_CASES.values(),
ids=READ_JSONL_DICT_TEST_CASES.keys()
)
def test_read_jsonl_dicts(file_contents: str, expected: Union[List[Dict[str, Any]], ValueError]):

with make_tempdir({"tmp.json": file_contents}) as temp_dir:
file_path = temp_dir / "tmp.json"
assert file_path.exists()
data = read_jsonl_dicts(file_path)
# Make sure this returns a generator, not just a list
assert not hasattr(data, "__len__")
try:
# actually consume the generator to trigger errors
data = list(data)
except ValueError:
assert isinstance(expected, ValueError)
else:
assert isinstance(expected, list)
assert len(data) == len(expected)
for data_item, expected_item in zip(data, expected):
assert data_item == expected_item