Add JSON reader & extractor #27

Merged: 21 commits, Jan 23, 2025
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
6 changes: 3 additions & 3 deletions README.md
@@ -3,15 +3,15 @@
[![Python package](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml/badge.svg)](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml)
[![Documentation Status](https://readthedocs.org/projects/ianalyzer-readers/badge/?version=latest)](https://ianalyzer-readers.readthedocs.io/en/latest/?badge=latest)

`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV, XLSX or TTL files.
`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV, JSON, XLSX or RDF (Linked Data) files.

This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type.

The basic usage is that you will use the utilities in this package to create a "reader" class. You specify what your data looks like, and then call the `documents()` method of the reader to get an iterator of documents - where each document is a flat dictionary of key/value pairs.

## Prerequisites

Requires Python 3.8 or later.
Requires Python 3.9 or later.

## Contents

@@ -25,7 +25,7 @@ Our primary use for this package is to pre-process data for I-analyzer, but you

Using this package makes sense if you want to extract data in the shape that it is designed for (i.e., a list of flat dictionaries).

What we find especially useful is that all subclasses of `Reader` have the same interface - regardless of whether they are processing CSV, XML, HTML, or XLSX data. That common interface is crucial in an application that needs to process corpora from different source types, like I-analyzer.
What we find especially useful is that all subclasses of `Reader` have the same interface - regardless of whether they are processing CSV, JSON, XML, HTML, RDF or XLSX data. That common interface is crucial in an application that needs to process corpora from different source types, like I-analyzer.

## Usage

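To make the README's "basic usage" concrete, here is a minimal sketch of a reader definition and the shared `documents()` interface. The class name, fields, file path, and `sources()` wiring are assumptions for illustration, not taken from the package documentation.

```python
# A minimal sketch, assuming the Field/extractor API shown elsewhere in this PR.
# ExampleReader, its fields, and the file path are hypothetical.
from ianalyzer_readers.readers.json import JSONReader
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.extract import JSON

class ExampleReader(JSONReader):
    single_document = True  # one document per .json file

    fields = [
        Field('title', JSON('title')),
        Field('author', JSON('author')),
    ]

    def sources(self, **kwargs):
        # assumed data layout: a single example file
        yield 'data/example.json'

# every Reader subclass exposes the same interface:
reader = ExampleReader()
for document in reader.documents():
    print(document)  # a flat dictionary of field names to values
```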
6 changes: 6 additions & 0 deletions docs/api.md
@@ -36,6 +36,12 @@ __Module:__ `ianalyzer_readers.readers.rdf`

::: ianalyzer_readers.readers.rdf

## JSON reader

__Module:__ `ianalyzer_readers.readers.json`

::: ianalyzer_readers.readers.json

## Extractors

__Module:__ `ianalyzer_readers.extract`
24 changes: 24 additions & 0 deletions ianalyzer_readers/extract.py
@@ -467,6 +467,7 @@ def format(self, value):
if value and value not in self.convert_to_none:
return value


class ExternalFile(Extractor):
'''
Free for all external file extractor that provides a stream to `stream_handler`
@@ -491,6 +492,29 @@ def _apply(self, metadata, *nargs, **kwargs):
return self.stream_handler(open(metadata['associated_file'], 'r'))


class JSON(Extractor):
    '''
    An extractor to extract data from JSON.
    This extractor assumes that each source is a dictionary without nested lists.
    When working with nested lists, use JSONReader to unnest them.

    Parameters:
        keys (Iterable[str]): the keys with which to retrieve a field value from the source
    '''

    def __init__(self, *keys, **kwargs):
        self.keys = list(keys)
        super().__init__(**kwargs)

    def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs) -> str:
        key = self.keys[key_index]
        data = data.get(key)
        if len(self.keys) > key_index + 1:
            key_index += 1
            return self._apply(data, key_index)
        return data
Comment on lines +509 to +515
Contributor:

This doesn't work with an empty list of keys, but I can imagine there would be use cases for that.

Contributor Author:

That would be useful only if we want to parse the whole contents of a file. I cannot imagine a use case for that off the top of my head. Leaving this for later.
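For illustration, a standalone sketch of the nested-key traversal that `JSON._apply` performs. The data is made up, and unlike the recursive implementation above, this loop simply stops when a key is missing.

```python
# Hypothetical data; mirrors the nested-key lookup of the JSON extractor above.
doc = {'scene': {'character': 'tourist', 'speech': 'I will not buy this record.'}}

def lookup(data, keys):
    # descend one key at a time, like the recursive _apply
    for key in keys:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data

print(lookup(doc, ['scene', 'speech']))     # 'I will not buy this record.'
print(lookup(doc, ['scene', 'character']))  # 'tourist'
```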



class RDF(Extractor):
"""An extractor to extract data from RDF triples

15 changes: 11 additions & 4 deletions ianalyzer_readers/readers/core.py
@@ -12,18 +12,25 @@
import logging
import csv

from requests import Response

logging.basicConfig(level=logging.WARNING)
logging.getLogger('ianalyzer-readers').setLevel(logging.DEBUG)
logger = logging.getLogger('ianalyzer-readers')
logger.setLevel(logging.DEBUG)

SourceData = Union[str, Response, bytes]
'''Type definition of the data types a Reader method can handle.'''

Source = Union[str, Tuple[Union[str, bytes], Dict], bytes]
Source = Union[SourceData, Tuple[SourceData, Dict]]
'''
Type definition for the source input to some Reader methods.

Sources are either:

- a string with the path to a filename
- a tuple containing a path to a filename, and a dictionary with metadata
- binary data with the file contents. This is not supported on all Reader subclasses.
- binary data with the file contents. This is not supported on all Reader subclasses
- a requests.Response
- a tuple of one of the above, and a dictionary with metadata

'''

Document = Dict[str, Any]
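By way of illustration, the forms listed in the updated `Source` docstring could look as follows. The file names, URL, and metadata are made up, and the `requests.get` call needs network access.

```python
# Hypothetical examples of each accepted Source form.
import requests

path_source = 'data/speeches.json'                                    # path to a file
bytes_source = b'{"speech": "No sir. This is a tobacconist\'s."}'     # raw file contents
response_source = requests.get('https://example.com/api/documents')   # a requests.Response
tagged_source = ('data/speeches.json', {'collection': 'Monty Python'})  # (source, metadata)
```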
153 changes: 153 additions & 0 deletions ianalyzer_readers/readers/json.py
@@ -0,0 +1,153 @@
'''
This module defines the JSONReader.

It can parse documents nested in a single file, for which it uses the pandas library,
or multiple files with one document each, which are parsed with the standard Python json module.
'''

import json
from os.path import isfile
from typing import Iterable, List, Optional, Union

from pandas import json_normalize
from requests import Response

from .core import Reader, Document, Source
import ianalyzer_readers.extract as extract

class JSONReader(Reader):
Contributor:

As a design question, since the single-document and multiple-document versions use different libraries and accept different arguments, perhaps they should just be two different classes?

    '''
    A base class for Readers of JSON encoded data.

    The reader can either be used on a collection of JSON files (`single_document=True`), where each file represents a document,
    or on a single JSON file containing a list of documents.

    If the attributes `record_path` and `meta` are set, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data.

    Attributes:
        single_document: indicates whether the data is organized such that a file represents a single document
        record_path: a path or list of paths by which a list of documents can be extracted from a large JSON file; irrelevant if `single_document = True`
        meta: a list of paths, or list of lists of paths, by which metadata common to all documents can be located; irrelevant if `single_document = True`

    Examples:
    ### Multiple documents in one file:
    ```python
    example_data = {
        'path': {
            'sketch': 'Hungarian Phrasebook',
            'episode': 25,
            'to': {
                'records': [
                    {'speech': 'I will not buy this record. It is scratched.', 'character': 'tourist'},
                    {'speech': "No sir. This is a tobacconist's.", 'character': 'tobacconist'}
                ]
            }
        }
    }

    class MyJSONReader(JSONReader):
        record_path = ['path', 'to', 'records']
        meta = [['path', 'sketch'], ['path', 'episode']]

        speech = Field('speech', JSON('speech'))
        character = Field('character', JSON('character'))
        sketch = Field('sketch', JSON('path.sketch'))
        episode = Field('episode', JSON('path.episode'))
    ```
    To define the paths used to extract field values, consider the data format that `pandas.json_normalize` creates:
    a table in which each row represents a document and each column corresponds to a path, either relative to the documents within `record_path`
    or relative to the top level (`meta`), with nested paths joined by dots.
    ```csv
    row,speech,character,path.sketch,path.episode
    0,"I will not buy this record. It is scratched.","tourist","Hungarian Phrasebook",25
    1,"No sir. This is a tobacconist's.","tobacconist","Hungarian Phrasebook",25
    ```

    ### Single document per file:
    ```python
    example_data = {
        'sketch': 'Hungarian Phrasebook',
        'episode': 25,
        'scene': {
            'character': 'tourist',
            'speech': 'I will not buy this record. It is scratched.'
        }
    }

    class MyJSONReader(JSONReader):
        single_document = True

        speech = Field('speech', JSON('scene', 'speech'))
        character = Field('character', JSON('scene', 'character'))
        sketch = Field('sketch', JSON('sketch'))
        episode = Field('episode', JSON('episode'))
    ```

    '''

    single_document: bool = False
    '''
    Set to `True` if the data is structured such that one document is encoded in one .json file.
    In that case, the reader assumes that the file contains no lists.
    '''

    record_path: Optional[List[str]] = None
    '''
    A keyword or list of keywords by which a list of documents can be extracted from a large JSON file.
    Only relevant if `single_document=False`.
    '''

    meta: Optional[List[Union[str, List[str]]]] = None
    '''
    A list of keywords, or list of lists of keywords, by which metadata for each document can be located,
    if it is in a different path than `record_path`. Only relevant if `single_document=False`.
    '''

    def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
        """
        Given a source, returns an iterable of extracted documents.

        Parameters:
            source: the input data

        Returns:
            an iterable of documents
        """
        if isinstance(source, tuple):
            metadata = source[1]
            json_data = self._get_json_data(source[0])
        else:
            metadata = None
            json_data = self._get_json_data(source)

        if not self.single_document:
            documents = json_normalize(
                json_data, record_path=self.record_path, meta=self.meta
            ).to_dict('records')
        else:
            documents = [json_data]

        self._reject_extractors(extract.XML, extract.CSV, extract.RDF)
Contributor:

This line is correct, but all other Reader classes also need to reject the JSON extractor. (Not a great system, admittedly.)


        for doc in documents:
            field_dict = {
                field.name: field.extractor.apply(
                    doc, metadata=metadata, *nargs, **kwargs
                )
                for field in self.fields
            }

            yield field_dict

    def _get_json_data(self, source: Source) -> dict:
        # check Response and bytes before isfile, since os.path.isfile
        # only accepts path-like arguments
        if isinstance(source, Response):
            return source.json()
        elif isinstance(source, bytes):
            return json.loads(source)
        elif isfile(source):
            with open(source, "r") as f:
                return json.load(f)
        else:
            raise Exception("Unexpected source type for JSON Reader")
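A hedged usage sketch of the reader defined above: the reader class, field names and data are hypothetical, and only source forms that `_get_json_data` clearly handles (raw bytes, a file path, or a tuple with metadata) are shown.

```python
from ianalyzer_readers.readers.json import JSONReader
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.extract import JSON

class SketchReader(JSONReader):
    single_document = True
    fields = [
        Field('sketch', JSON('sketch')),
        Field('episode', JSON('episode')),
    ]

reader = SketchReader()
raw = b'{"sketch": "Hungarian Phrasebook", "episode": 25}'

# from raw bytes
print(list(reader.source2dicts(raw)))
# expected (roughly): [{'sketch': 'Hungarian Phrasebook', 'episode': 25}]

# the same bytes wrapped in a (source, metadata) tuple
print(list(reader.source2dicts((raw, {'collection': 'Monty Python'}))))
```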
14 changes: 8 additions & 6 deletions ianalyzer_readers/readers/rdf.py
@@ -35,8 +35,8 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
are based on the extractor of each field.
'''
self._reject_extractors(extract.CSV, extract.XML)
if type(source) == bytes:

if isinstance(source, bytes):
raise Exception('The current reader cannot handle sources of bytes type, provide a file path as string instead')
try:
(filename, metadata) = source
@@ -45,12 +45,14 @@
metadata = None

logger.info(f"parsing {filename}")
g = self.parse_graph_from_filename(filename)

g = self.parse_graph_from_filename(
filename
) # TODO: we could also allow Response as source data here, but that would mean the response would also need to include information of the data format, see [this example](https://github.com/RDFLib/rdflib/blob/4.1.2/rdflib/graph.py#L209)

document_subjects = self.document_subjects(g)
for subject in document_subjects:
yield self._document_from_subject(g, subject, metadata)

def parse_graph_from_filename(self, filename: str) -> Graph:
''' Read an RDF file as indicated by source, return a graph
Override this function to parse multiple source files into one graph
Expand All @@ -64,7 +66,7 @@ def parse_graph_from_filename(self, filename: str) -> Graph:
g = Graph()
g.parse(filename)
return g

def document_subjects(self, graph: Graph) -> Iterable[Union[BNode, Literal, URIRef]]:
''' Override this function to return all subjects (i.e., first part of RDF triple)
with which to search for data in the RDF graph.
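Not part of this PR, but a rough sketch of what the TODO above might look like: deriving the serialization format from the response's Content-Type header. The function name is hypothetical, and it assumes rdflib accepts a media type as its `format` argument.

```python
from rdflib import Graph
from requests import Response

def parse_graph_from_response(response: Response) -> Graph:
    # strip parameters such as '; charset=utf-8' from the Content-Type header
    media_type = response.headers.get('Content-Type', '').split(';')[0].strip()
    g = Graph()
    g.parse(data=response.text, format=media_type or 'turtle')
    return g
```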
12 changes: 10 additions & 2 deletions ianalyzer_readers/readers/xml.py
@@ -7,6 +7,7 @@
import bs4
import logging
from os.path import isfile
from requests import Response
from typing import Dict, Iterable, Tuple, List

from .. import extract
@@ -178,13 +179,20 @@ def _filename_soup_and_metadata_from_source(self, source: Source) -> Tuple[str,
soup = self._soup_from_data(source)
filename = None
metadata = {}
elif isinstance(source, Response):
soup = self._soup_from_data(source.text)
filename = None
metadata = {}
else:
if isfile(source[0]):
if isinstance(source[0], str):
filename = source[0]
soup = self._soup_from_xml(filename)
else:
filename = None
soup = self._soup_from_data(source[0])
if isinstance(source[0], bytes):
soup = self._soup_from_data(source[0])
elif isinstance(source[0], Response):
soup = self._soup_from_data(source[0].text)
metadata = source[1] or None
return filename, soup, metadata

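A standalone sketch of what the new Response branch above roughly amounts to: building a soup object from `response.text`. The URL is a placeholder, and the parser choice here is an assumption, not necessarily the one the reader uses internally.

```python
import requests
from bs4 import BeautifulSoup

response = requests.get('https://example.com/page.html')
soup = BeautifulSoup(response.text, 'lxml')  # parse the fetched markup
print(soup.title)
```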
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
"beautifulsoup4",
"lxml",
"openpyxl",
"pandas",
Contributor:

Pandas is quite a large library, which seems out of balance with how little it's doing here. You already wrote the functionality to look up paths in JSON data (for the extractor), so the extra bit of logic to get the list of documents and the metadata based on a path seems kind of trivial?

Contributor Author (@BeritJanssen, Jan 15, 2025):

I actually struggled quite a bit with parsing a nested list, as well as metadata. The pandas implementation is far from trivial, as it also allows for data within the record_path array to be nested. I decided the "don't roll your own" policy applies here. I agree though that pandas is a heavy dependency, but I-Analyzer already has this dependency, so for our purposes, I don't see a problem.

Contributor:

Ah, I probably underestimated this at first glance. And you're right, it's a non-issue for I-analyzer. Let's go with the pandas solution then 👍

"requests",
"rdflib",
]

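A small demonstration of the point discussed above: `pandas.json_normalize` also flattens fields that are nested inside the records under `record_path`. The data is made up for the example.

```python
from pandas import json_normalize

data = {
    'episode': 25,
    'records': [
        {'character': {'name': 'tourist', 'nationality': 'Hungarian'}, 'speech': '...'},
        {'character': {'name': 'tobacconist'}, 'speech': '...'},
    ],
}

# flatten the nested records and carry 'episode' along as metadata
table = json_normalize(data, record_path='records', meta=['episode'])
print(table.columns.tolist())
# e.g. ['speech', 'character.name', 'character.nationality', 'episode']
```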