feat: improve documentation

CentreForDigitalHumanities · Dec 19, 2024 · 48401a0 · 48401a0
1 parent 5b2b42a
commit 48401a0
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 8 deletions.
diff --git a/docs/api.md b/docs/api.md
@@ -36,6 +36,12 @@ __Module:__ `ianalyzer_readers.readers.rdf`
 
 ::: ianalyzer_readers.readers.rdf
 
+## JSON reader
+
+__Module:__ `ianalyzer_readers.readers.json`
+
+::: ianalyzer_readers.readers.json
+
 ## Extractors
 
 __Module:__ `ianalyzer_readers.extract`

diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py
@@ -493,8 +493,10 @@ def _apply(self, metadata, *nargs, **kwargs):
 
 
 class JSON(Extractor):
-    '''An extractor to extract data from JSON
-    This extractor assumes that each source is a flat dictionary
+    '''
+    An extractor to extract data from JSON.
+    This extractor assumes that each source is a dictionary without nested lists.
+    When working with nested lists, use JSONReader to unnest.
 
     Parameters:
         keys (Iterable[str]): the keys with which to retrieve a field value from the source

diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py
@@ -1,3 +1,10 @@
+'''
+This module defines the JSONReader.
+
+It can parse documents nested in one file, for which it uses the pandas library,
+or multiple files with one document each, for which it uses the generic Python json parser.
+'''
+
 import json
 from os.path import isfile
 from typing import Iterable, List, Optional, Union
@@ -9,21 +16,86 @@
 import ianalyzer_readers.extract as extract
 
 class JSONReader(Reader):
-    """
+    '''
     A base class for Readers of JSON encoded data.
 
-    The reader can either be used on a collection of JSON files, in which each file represents a document,
+    The reader can either be used on a collection of JSON files (`single_document=True`), in which each file represents a document,
     or for a JSON file containing lists of documents.
 
-    If the attributes `record_path` and `meta` are passed, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data
+    If the attributes `record_path` and `meta` are set, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data
 
     Attributes:
-        record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; do not define if the corpus is structured as one file per document
-        meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located
+        single_document: indicates whether the data is organized such that a file represents a single document
+        record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; irrelevant if `single_document = True`
+        meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located; irrelevant if `single_document = True`
     """
 
+    Examples:
+        ##### Multiple documents in one file:
+        ```python
+        example_data = {
+            'path': {
+                'sketch': 'Hungarian Phrasebook',
+                'episode': 25,
+                'to': {
+                    'records':
+                        [
+                            {'speech': 'I will not buy this record. It is scratched.', 'character': 'tourist'},
+                            {'speech': "No sir. This is a tobacconist's.", 'character': 'tobacconist'}
+                        ]
+                }
+            }
+        }
+
+        class MyJSONReader(JSONReader):
+            record_path = ['path', 'to', 'records']
+            meta = [['path', 'sketch'], ['path', 'episode']]
+
+            speech = Field('speech', JSON('speech'))
+            character = Field('character', JSON('character'))
+            sketch = Field('sketch', JSON('path.sketch')) # field name results from paths in `meta` array, separated by a dot
+            episode = Field('episode', JSON('path.episode'))
+        ```
+
+        ##### Single document per file:
+        ```python
+        example_data = {
+            'sketch': 'Hungarian Phrasebook',
+            'episode': 25,
+            'scene': {
+                'character': 'tourist',
+                'speech': 'I will not buy this record. It is scratched.'
+            }
+        }
+
+        class MyJSONReader(JSONReader):
+            single_document = True
+
+            speech = Field('speech', JSON('scene', 'speech'))
+            character = Field('character', JSON('scene', 'character'))
+            sketch = Field('sketch', JSON('sketch'))
+            episode = Field('episode', JSON('episode'))
+        ```
+
+    '''
+
+    single_document: bool = False
+    '''
+    set to `True` if the data is structured such that one document is encoded in one .json file
+    in that case, the reader assumes that there are no lists in such a file
+    '''
+
     record_path: Optional[List[str]] = None
+    '''
+    a keyword or list of keywords by which a list of documents can be extracted from a large JSON file.
+    Only relevant if `single_document=False`.
+    '''
+
     meta: Optional[List[Union[str, List[str]]]] = None
+    '''
+    a list of keywords, or list of lists of keywords, by which metadata for each document can be located,
+    if it is in a different path than `record_path`. Only relevant if `single_document=False`.
+    '''
 
     def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
         """
@@ -42,7 +114,7 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
             metadata = None
             json_data = self._get_json_data(source)
 
-        if self.record_path and self.meta:
+        if not self.single_document:
             documents = json_normalize(json_data, self.record_path, self.meta).to_dict(
                 'records'
             )