Skip to content

Commit

Permalink
feat: improve documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Dec 19, 2024
1 parent 5b2b42a commit 48401a0
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 8 deletions.
6 changes: 6 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ __Module:__ `ianalyzer_readers.readers.rdf`

::: ianalyzer_readers.readers.rdf

## JSON reader

__Module:__ `ianalyzer_readers.readers.json`

::: ianalyzer_readers.readers.json

## Extractors

__Module:__ `ianalyzer_readers.extract`
Expand Down
6 changes: 4 additions & 2 deletions ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,8 +493,10 @@ def _apply(self, metadata, *nargs, **kwargs):


class JSON(Extractor):
'''An extractor to extract data from JSON
This extractor assumes that each source is a flat dictionary
'''
An extractor to extract data from JSON.
This extractor assumes that each source is a dictionary without nested lists.
When working with nested lists, use JSONReader to unnest.
Parameters:
keys (Iterable[str]): the keys with which to retrieve a field value from the source
Expand Down
84 changes: 78 additions & 6 deletions ianalyzer_readers/readers/json.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
'''
This module defines the JSONReader.
It can parse documents nested in one file, for which it uses the pandas library,
or multiple files with one document each, for which it uses the generic Python json parser.
'''

import json
from os.path import isfile
from typing import Iterable, List, Optional, Union
Expand All @@ -9,21 +16,86 @@
import ianalyzer_readers.extract as extract

class JSONReader(Reader):
"""
'''
A base class for Readers of JSON encoded data.
The reader can either be used on a collection of JSON files, in which each file represents a document,
The reader can either be used on a collection of JSON files (`single_document=True`), in which each file represents a document,
or on a JSON file containing lists of documents.
If the attributes `record_path` and `meta` are passed, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data
If the attributes `record_path` and `meta` are set, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data.
Attributes:
record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; do not define if the corpus is structured as one file per document
meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located
single_document: indicates whether the data is organized such that a file represents a single document
record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; irrelevant if `single_document = True`
meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located; irrelevant if `single_document = True`
"""
Examples:
##### Multiple documents in one file:
```python
example_data = {
'path': {
'sketch': 'Hungarian Phrasebook',
'episode': 25,
'to': {
'records':
[
{'speech': 'I will not buy this record. It is scratched.', 'character': 'tourist'},
{'speech': "No sir. This is a tobacconist's.", 'character': 'tobacconist'}
]
}
}
}
class MyJSONReader(JSONReader):
record_path = ['path', 'to', 'records']
meta = [['path', 'sketch'], ['path', 'episode']]
speech = Field('speech', JSON('speech'))
character = Field('character', JSON('character'))
sketch = Field('sketch', JSON('path.sketch')) # field name results from paths in `meta` array, separated by a dot
episode = Field('episode', JSON('path.episode'))
```
##### Single document per file:
```python
example_data = {
'sketch': 'Hungarian Phrasebook',
'episode': 25,
'scene': {
'character': 'tourist',
'speech': 'I will not buy this record. It is scratched.'
}
}
class MyJSONReader(JSONReader):
single_document = True
speech = Field('speech', JSON('scene', 'speech'))
character = Field('character', JSON('scene', 'character'))
sketch = Field('sketch', JSON('sketch'))
episode = Field('episode', JSON('episode'))
```
'''

single_document: bool = False
'''
set to `True` if the data is structured such that one document is encoded in one .json file.
In that case, the reader assumes that there are no lists in such a file.
'''

record_path: Optional[List[str]] = None
'''
a keyword or list of keywords by which a list of documents can be extracted from a large JSON file.
Only relevant if `single_document=False`.
'''

meta: Optional[List[Union[str, List[str]]]] = None
'''
a list of keywords, or list of lists of keywords, by which metadata for each document can be located,
if it is in a different path than `record_path`. Only relevant if `single_document=False`.
'''

def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
"""
Expand All @@ -42,7 +114,7 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
metadata = None
json_data = self._get_json_data(source)

if self.record_path and self.meta:
if not self.single_document:
documents = json_normalize(json_data, self.record_path, self.meta).to_dict(
'records'
)
Expand Down

0 comments on commit 48401a0

Please sign in to comment.