-
Notifications
You must be signed in to change notification settings - Fork 31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Typed JSON API #87
base: master
Are you sure you want to change the base?
Typed JSON API #87
Changes from 2 commits
afc5526
4e62665
70e8846
47dcc48
d174d88
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
from typing import Union, Iterable, Sequence, Any, Optional, Iterator | ||
from typing import Any, Iterable, Dict, List, Optional, Iterator, Union, Type, cast | ||
import sys | ||
import json as _builtin_json | ||
import gzip | ||
|
@@ -39,6 +39,32 @@ def json_loads(data: Union[str, bytes]) -> JSONOutput: | |
return ujson.loads(data) | ||
|
||
|
||
def json_loads_dict(data: Union[str, bytes]) -> Dict[str, Any]:
    """Deserialize unicode or bytes and require the result to be a dict.

    data (str / bytes): The data to deserialize.
    RAISES: ValueError if the loaded data is not a dict.
    RETURNS: The deserialized Python dict.
    """
    parsed = json_loads(data)
    # Happy path first: hand back the object untouched when it is a dict.
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("JSON data could not be parsed to a dict.")
|
||
|
||
def json_loads_list(data: Union[str, bytes]) -> List[Dict[str, Any]]:
    """Deserialize unicode or bytes to a Python list of dicts.

    data (str / bytes): The data to deserialize.
    RAISES: ValueError if the loaded data is not a list, or if any element
        of the list is not a dict.
    RETURNS: The deserialized Python list of dicts.
    """
    loaded = json_loads(data)
    # The return annotation promises List[Dict[str, Any]], so checking only
    # the outer container is not enough: every element must be a dict too.
    # An empty list is accepted (all() over no elements is True).
    if not isinstance(loaded, list) or not all(
        isinstance(item, dict) for item in loaded
    ):
        raise ValueError("JSON data could not be parsed to a list of dicts.")
    return loaded
|
||
|
||
def read_json(path: FilePath) -> JSONOutput: | ||
"""Load JSON from file or standard input. | ||
|
||
|
@@ -53,6 +79,40 @@ def read_json(path: FilePath) -> JSONOutput: | |
return ujson.load(f) | ||
|
||
|
||
def read_json_dict(path: FilePath) -> Dict[str, Any]:
    """Load JSON from file or standard input and require a top-level dict.

    path (FilePath): The file path. "-" for reading from stdin.
    RAISES: ValueError if the loaded content is not a dict.
    RETURNS (Dict[str, Any]): The loaded JSON content.
    """
    content = read_json(path)
    # Return early on the expected shape; anything else is rejected below.
    if isinstance(content, dict):
        return content
    raise ValueError("Invalid JSON, data could not be parsed to a dict.")
|
||
|
||
# Review thread attached to this function in the PR (inline code spans appear
# to have been lost in extraction and are marked [...]):
#   Reviewer: This is still a bit of a typing problem because the return type
#   depends on the value of [...] I think this method with these options and
#   types may be getting too specific for inclusion in the [...] If it is
#   going to be in [...]
#   Author: Agreed that was weird. I fixed these types and separated this
#   logic into 2 functions. This parsing as a list idea is kinda extra
#   honestly. I think the main functions I really think should live in srsly
#   are:
def read_json_list(path: FilePath, validate_inner: bool = False, skip_invalid: bool = False) -> List[Dict[str, Any]]:
    """Load a JSON list of dicts from file or standard input.

    path (FilePath): The file path. "-" for reading from stdin.
    validate_inner (bool): Currently unused; kept so existing calls keep
        working. Elements are always checked, regardless of this flag,
        because the return type promises dicts.
    skip_invalid (bool): Drop elements that are not dicts instead of raising
        ValueError.
    RAISES: ValueError if the loaded content is not a list, or if an element
        is not a dict and skip_invalid is False.
    RETURNS (List[Dict[str, Any]]): The dict elements of the loaded list, in
        their original order.
    """
    data = read_json(path)
    if not isinstance(data, list):
        raise ValueError("Invalid JSON, data could not be parsed to a list of dicts.")

    output = []
    for i, obj in enumerate(data):
        if isinstance(obj, dict):
            output.append(obj)
        elif not skip_invalid:
            # The reported position is 1-based (i + 1), as in the original message.
            raise ValueError(f"Invalid JSON Object at index: {i + 1}. Value is not a valid dict.")
    # Return the filtered list. Returning `data` here would hand the skipped
    # non-dict elements back to the caller and make skip_invalid a no-op.
    return output
|
||
|
||
def read_gzip_json(path: FilePath) -> JSONOutput: | ||
"""Load JSON from a gzipped file. | ||
|
||
|
@@ -149,6 +209,22 @@ def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]: | |
yield line | ||
|
||
|
||
def read_jsonl_dicts(path: FilePath, skip: bool = False) -> Iterable[Dict[str, Any]]:
    """Read a .jsonl file or standard input and yield each line's dict.
    Iterates over read_jsonl(path, skip=skip) and checks that every item it
    produces is a dict.

    path (FilePath): The file path. "-" for reading from stdin.
    skip (bool): Skip broken or non-dict lines and don't raise ValueError.
    RAISES: ValueError if an item is not a dict and skip is False.
    YIELDS (Dict[str, Any]): The loaded JSON contents of each line.
    """
    # The number counts items produced by read_jsonl, starting at 1. It may
    # differ from the physical line number if read_jsonl drops lines (TODO confirm).
    for line_number, record in enumerate(read_jsonl(path, skip=skip), start=1):
        if isinstance(record, dict):
            yield record
        elif not skip:
            raise ValueError(f"Invalid JSON Object on line: {line_number}. Line is not a valid dict.")
|
||
|
||
def write_jsonl( | ||
path: FilePath, | ||
lines: Iterable[JSONInput], | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This and `read_json_list` only check `List`, not `List[Dict[str, Any]]`? I wouldn't expect a function that's called `read_json_list` to have the additional dict behavior/type?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, this part is a little weird. Again, what I want 99% of the time when calling `read_json_list` is to have a list of dicts. But the naming does leave something to be desired. There is some validation in place for the `read_json_list` function, just not for `json_loads_list`. I can copy the validation over there though.
Is `read_json_dicts` a better name here, similar to `read_jsonl_dicts`?