diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 55ab63e..8aed826 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3
diff --git a/README.md b/README.md
index 7de83c6..628ae82 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![Python package](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml/badge.svg)](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml)
 [![Documentation Status](https://readthedocs.org/projects/ianalyzer-readers/badge/?version=latest)](https://ianalyzer-readers.readthedocs.io/en/latest/?badge=latest)
 
-`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV, XLSX or TTL files.
+`ianalyzer-readers` is a Python module to extract data from XML, HTML, CSV, JSON, XLSX, or RDF (Linked Data) files.
 
 This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface.
 
@@ -11,7 +11,7 @@ The basic usage is that you will use the utilities in this package to create a "
 
 ## Prerequisites
 
-Requires Python 3.8 or later.
+Requires Python 3.9 or later.
 
 ## Contents
 
@@ -25,7 +25,7 @@ Our primary use for this package is to pre-process data for I-analyzer, but you
 
 Using this package makes sense if you want to extract data in the shape that it is designed for (i.e., a list of flat dictionaries).
 
-What we find especially useful is that all subclasses of `Reader` have the same interface - regardless of whether they are processing CSV, XML, HTML, or XLSX data. That common interface is crucial in an application that needs to process corpora from different source types, like I-analyzer.
+What we find especially useful is that all subclasses of `Reader` have the same interface, regardless of whether they are processing CSV, JSON, XML, HTML, RDF, or XLSX data. That common interface is crucial in an application that needs to process corpora from different source types, like I-analyzer.
 
 ## Usage
 
diff --git a/docs/api.md b/docs/api.md
index 6833b96..5859d25 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -36,6 +36,12 @@ __Module:__ `ianalyzer_readers.readers.rdf`
 
 ::: ianalyzer_readers.readers.rdf
 
+## JSON reader
+
+__Module:__ `ianalyzer_readers.readers.json`
+
+::: ianalyzer_readers.readers.json
+
 ## Extractors
 
 __Module:__ `ianalyzer_readers.extract`
diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py
index c6d9599..82d29a6 100644
--- a/ianalyzer_readers/extract.py
+++ b/ianalyzer_readers/extract.py
@@ -467,6 +467,7 @@ def format(self, value):
         if value and value not in self.convert_to_none:
             return value
 
+
 class ExternalFile(Extractor):
     '''
     Free for all external file extractor that provides a stream to `stream_handler`
@@ -491,6 +492,29 @@ def _apply(self, metadata, *nargs, **kwargs):
         return self.stream_handler(open(metadata['associated_file'], 'r'))
 
 
+class JSON(Extractor):
+    '''
+    An extractor to extract data from JSON.
+    This extractor assumes that each source is a dictionary without nested lists.
+    When working with nested lists, use the JSONReader to unnest them first.
+
+    Parameters:
+        keys (Iterable[str]): the keys with which to retrieve a field value from the source
+    '''
+
+    def __init__(self, *keys, **kwargs):
+        self.keys = list(keys)
+        super().__init__(**kwargs)
+
+    def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs) -> str:
+        key = self.keys[key_index]
+        data = data.get(key)
+        if len(self.keys) > key_index + 1:
+            key_index += 1
+            return self._apply(data, key_index)
+        return data
+
+
 class RDF(Extractor):
     """An extractor to extract data from RDF triples
 
diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py
index ba175b9..1a3f7eb 100644
--- a/ianalyzer_readers/readers/core.py
+++ b/ianalyzer_readers/readers/core.py
@@ -12,18 +12,26 @@
 import logging
 import csv
 
+from requests import Response
+
 logging.basicConfig(level=logging.WARNING)
-logging.getLogger('ianalyzer-readers').setLevel(logging.DEBUG)
+logger = logging.getLogger('ianalyzer-readers')
+logger.setLevel(logging.DEBUG)
+
+SourceData = Union[str, Response, bytes]
+'''Type definition of the data types a Reader method can handle.'''
 
-Source = Union[str, Tuple[Union[str, bytes], Dict], bytes]
+Source = Union[SourceData, Tuple[SourceData, Dict]]
 '''
 Type definition for the source input to some Reader methods.
 
 Sources are either:
 
 - a string with the path to a filename
-- a tuple containing a path to a filename, and a dictionary with metadata
-- binary data with the file contents. This is not supported on all Reader subclasses.
+- binary data with the file contents; this is not supported on all Reader subclasses
+- a `requests.Response` object
+- a tuple of one of the above, and a dictionary with metadata
+
 '''
 
 Document = Dict[str, Any]
diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py
new file mode 100644
index 0000000..a2cba3a
--- /dev/null
+++ b/ianalyzer_readers/readers/json.py
@@ -0,0 +1,153 @@
+'''
+This module defines the JSONReader.
+
+It can parse multiple documents nested in a single file, for which it uses the pandas library,
+or multiple files with one document each, which are parsed with the standard library json parser.
+'''
+
+import json
+from os.path import isfile
+from typing import Iterable, List, Optional, Union
+
+from pandas import json_normalize
+from requests import Response
+
+from .core import Reader, Document, Source
+import ianalyzer_readers.extract as extract
+
+
+class JSONReader(Reader):
+    '''
+    A base class for Readers of JSON encoded data.
+
+    The reader can either be used on a collection of JSON files (`single_document=True`), where each file represents a document,
+    or on a single JSON file containing a list of documents.
+
+    If the attributes `record_path` and `meta` are set, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data.
+
+    Attributes:
+        single_document: indicates whether the data is organized such that a file represents a single document
+        record_path: a path or list of paths by which a list of documents can be extracted from a large JSON file; irrelevant if `single_document = True`
+        meta: a list of paths, or list of lists of paths, by which metadata common to all documents can be located; irrelevant if `single_document = True`
+
+    Examples:
+        ### Multiple documents in one file:
+        ```python
+        example_data = {
+            'path': {
+                'sketch': 'Hungarian Phrasebook',
+                'episode': 25,
+                'to': {
+                    'records':
+                        [
+                            {'speech': 'I will not buy this record. It is scratched.', 'character': 'tourist'},
+                            {'speech': "No sir. This is a tobacconist's.", 'character': 'tobacconist'}
+                        ]
+                }
+            }
+        }
+
+        class MyJSONReader(JSONReader):
+            record_path = ['path', 'to', 'records']
+            meta = [['path', 'sketch'], ['path', 'episode']]
+
+            speech = Field('speech', JSON('speech'))
+            character = Field('character', JSON('character'))
+            sketch = Field('sketch', JSON('path.sketch'))
+            episode = Field('episode', JSON('path.episode'))
+        ```
+        To define the paths used to extract field values, consider the data format that `pandas.json_normalize` creates:
+        a table with each row representing a document, and columns corresponding to paths, either relative to documents within `record_path`,
+        or relative to the top level (`meta`), with nested paths joined by dots.
+        ```csv
+        row,speech,character,path.sketch,path.episode
+        0,"I will not buy this record. It is scratched.","tourist","Hungarian Phrasebook",25
+        1,"No sir. This is a tobacconist's.","tobacconist","Hungarian Phrasebook",25
+        ```
+
+        ### Single document per file:
+        ```python
+        example_data = {
+            'sketch': 'Hungarian Phrasebook',
+            'episode': 25,
+            'scene': {
+                'character': 'tourist',
+                'speech': 'I will not buy this record. It is scratched.'
+            }
+        }
+
+        class MyJSONReader(JSONReader):
+            single_document = True
+
+            speech = Field('speech', JSON('scene', 'speech'))
+            character = Field('character', JSON('scene', 'character'))
+            sketch = Field('sketch', JSON('sketch'))
+            episode = Field('episode', JSON('episode'))
+        ```
+
+    '''
+
+    single_document: bool = False
+    '''
+    Set to `True` if the data is structured such that one document is encoded in one .json file.
+    In that case, the reader assumes that there are no lists in such a file.
+    '''
+
+    record_path: Optional[List[str]] = None
+    '''
+    A keyword or list of keywords by which a list of documents can be extracted from a large JSON file.
+    Only relevant if `single_document=False`.
+    '''
+
+    meta: Optional[List[Union[str, List[str]]]] = None
+    '''
+    A list of keywords, or list of lists of keywords, by which metadata for each document can be located,
+    if it is in a different path than `record_path`. Only relevant if `single_document=False`.
+    '''
+
+    def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
+        """
+        Given a source, returns an iterable of extracted documents.
+
+        Parameters:
+            source: the input data
+
+        Returns:
+            an iterable of documents
+        """
+        if isinstance(source, tuple):
+            metadata = source[1]
+            json_data = self._get_json_data(source[0])
+        else:
+            metadata = None
+            json_data = self._get_json_data(source)
+
+        if not self.single_document:
+            documents = json_normalize(
+                json_data, record_path=self.record_path, meta=self.meta
+            ).to_dict('records')
+        else:
+            documents = [json_data]
+
+        self._reject_extractors(extract.XML, extract.CSV, extract.RDF)
+
+        for doc in documents:
+            field_dict = {
+                field.name: field.extractor.apply(
+                    doc, metadata=metadata, *nargs, **kwargs
+                )
+                for field in self.fields
+            }
+
+            yield field_dict
+
+    def _get_json_data(self, source: Source) -> dict:
+        if isinstance(source, str) and isfile(source):
+            with open(source, "r") as f:
+                return json.load(f)
+        elif isinstance(source, Response):
+            return source.json()
+        elif isinstance(source, bytes):
+            return json.loads(source)
+        else:
+            raise Exception("Unexpected source type for JSON Reader")
diff --git a/ianalyzer_readers/readers/rdf.py b/ianalyzer_readers/readers/rdf.py
index 755532d..544618c 100644
--- a/ianalyzer_readers/readers/rdf.py
+++ b/ianalyzer_readers/readers/rdf.py
@@ -35,8 +35,8 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
         are based on the extractor of each field.
         '''
         self._reject_extractors(extract.CSV, extract.XML)
-
-        if type(source) == bytes:
+
+        if isinstance(source, bytes):
             raise Exception('The current reader cannot handle sources of bytes type, provide a file path as string instead')
         try:
             (filename, metadata) = source
@@ -45,12 +45,15 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
             metadata = None
 
         logger.info(f"parsing {filename}")
-        g = self.parse_graph_from_filename(filename)
-
+        # TODO: we could also allow Response as source data here, but the response
+        # would also need to include information about the data format; see
+        # https://github.com/RDFLib/rdflib/blob/4.1.2/rdflib/graph.py#L209
+        g = self.parse_graph_from_filename(filename)
+
         document_subjects = self.document_subjects(g)
         for subject in document_subjects:
             yield self._document_from_subject(g, subject, metadata)
-
+
     def parse_graph_from_filename(self, filename: str) -> Graph:
         ''' Read a RDF file as indicated by source, return a graph
         Override this function to parse multiple source files into one graph
@@ -64,7 +67,7 @@
         g = Graph()
         g.parse(filename)
         return g
-
+
     def document_subjects(self, graph: Graph) -> Iterable[Union[BNode, Literal, URIRef]]:
         '''
         Override this function to return all subjects (i.e., first part of RDF triple) with which to search for data in the RDF graph.
diff --git a/ianalyzer_readers/readers/xml.py b/ianalyzer_readers/readers/xml.py
index 048a2b4..e9122af 100644
--- a/ianalyzer_readers/readers/xml.py
+++ b/ianalyzer_readers/readers/xml.py
@@ -7,6 +7,7 @@
 import bs4
 import logging
 from os.path import isfile
+from requests import Response
 from typing import Dict, Iterable, Tuple, List
 
 from .. import extract
@@ -178,13 +179,20 @@ def _filename_soup_and_metadata_from_source(self, source: Source) -> Tuple[str,
             soup = self._soup_from_data(source)
             filename = None
             metadata = {}
+        elif isinstance(source, Response):
+            soup = self._soup_from_data(source.text)
+            filename = None
+            metadata = {}
         else:
-            if isfile(source[0]):
+            if isinstance(source[0], str):
                 filename = source[0]
                 soup = self._soup_from_xml(filename)
             else:
                 filename = None
-                soup = self._soup_from_data(source[0])
+                if isinstance(source[0], bytes):
+                    soup = self._soup_from_data(source[0])
+                elif isinstance(source[0], Response):
+                    soup = self._soup_from_data(source[0].text)
             metadata = source[1] or None
 
         return filename, soup, metadata
diff --git a/pyproject.toml b/pyproject.toml
index f52f069..26cf5a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
     "beautifulsoup4",
     "lxml",
     "openpyxl",
+    "pandas",
+    "requests",
     "rdflib",
 ]
 
diff --git a/requirements.txt b/requirements.txt
index 4ac765f..575d362 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,10 @@
 #
 beautifulsoup4==4.12.3
     # via ianalyzer_readers (setup.py)
+certifi==2024.8.30
+    # via requests
+charset-normalizer==3.4.0
+    # via requests
 click==8.1.7
     # via
     #   mkdocs
@@ -20,6 +24,8 @@ ghp-import==2.1.0
     # via mkdocs
 griffe==0.42.0
     # via mkdocstrings-python
+idna==3.10
+    # via requests
 iniconfig==2.0.0
     # via pytest
 isodate==0.6.1
@@ -56,12 +62,16 @@ mkdocstrings==0.24.1
     # via mkdocstrings-python
 mkdocstrings-python==1.9.0
     # via ianalyzer_readers (setup.py)
+numpy==2.1.3
+    # via pandas
 openpyxl==3.1.2
     # via ianalyzer_readers (setup.py)
 packaging==24.0
     # via
     #   mkdocs
     #   pytest
+pandas==2.2.3
+    # via ianalyzer_readers (setup.py)
 pathspec==0.12.1
     # via mkdocs
 platformdirs==4.2.0
@@ -77,7 +87,11 @@ pyparsing==3.1.2
 pytest==8.1.1
     # via ianalyzer_readers (setup.py)
 python-dateutil==2.9.0.post0
-    # via ghp-import
+    # via
+    #   ghp-import
+    #   pandas
+pytz==2024.2
+    # via pandas
 pyyaml==6.0.1
     # via
     #   mkdocs
@@ -87,6 +101,8 @@ pyyaml-env-tag==0.1
     # via mkdocs
 rdflib==7.0.0
     # via ianalyzer_readers (setup.py)
+requests==2.32.3
+    # via ianalyzer_readers (setup.py)
 six==1.16.0
     # via
     #   isodate
@@ -95,5 +111,9 @@ soupsieve==2.5
     # via beautifulsoup4
 tomli==2.0.1
     # via pytest
+tzdata==2024.2
+    # via pandas
+urllib3==2.2.3
+    # via requests
 watchdog==4.0.0
     # via mkdocs
diff --git a/tests/json/data/Macbeth.json b/tests/json/data/Macbeth.json
new file mode 100644
index 0000000..b290e09
--- /dev/null
+++ b/tests/json/data/Macbeth.json
@@ -0,0 +1,61 @@
+{"TITLE":"ACT I",
+"SCENE":[
+  {
+    "TITLE":"SCENE I. A desert place.",
+    "STAGEDIR":[
+      "Thunder and lightning. Enter three Witches",
+      "Exeunt"
+    ],
+    "SPEECH":[
+      {
+        "SPEAKER":"First Witch",
+        "LINE":[
+          "When shall we three meet again",
+          "In thunder, lightning, or in rain?"
+        ]
+      },
+      {
+        "SPEAKER":"Second Witch",
+        "LINE":[
+          "When the hurlyburly's done,",
+          "When the battle's lost and won."
+        ]
+      },
+      {
+        "SPEAKER":"Third Witch",
+        "LINE":"That will be ere the set of sun."
+      },
+      {
+        "SPEAKER":"First Witch",
+        "LINE":"Where the place?"
+      },
+      {
+        "SPEAKER":"Second Witch",
+        "LINE":"Upon the heath."
+      },
+      {
+        "SPEAKER":"Third Witch",
+        "LINE":"There to meet with Macbeth."
+      },
+      {
+        "SPEAKER":"First Witch",
+        "LINE":"I come, Graymalkin!"
+      },
+      {
+        "SPEAKER":"Second Witch",
+        "LINE":"Paddock calls."
+      },
+      {
+        "SPEAKER":"Third Witch",
+        "LINE":"Anon."
+      },
+      {
+        "SPEAKER":"ALL",
+        "LINE":[
+          "Fair is foul, and foul is fair:",
+          "Hover through the fog and filthy air."
+        ]
+      }
+    ]
+  }]
+}
\ No newline at end of file
diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py
new file mode 100644
index 0000000..3ad67c5
--- /dev/null
+++ b/tests/json/json_reader.py
@@ -0,0 +1,69 @@
+from glob import glob
+import json
+import os
+from typing import Union
+
+from ianalyzer_readers.extract import JSON
+from ianalyzer_readers.readers.core import Field
+from ianalyzer_readers.readers.json import JSONReader
+
+
+def merge_lines(lines: Union[list, str]) -> str:
+    if isinstance(lines, list):
+        return "\n".join(lines)
+    return lines
+
+
+class JSONDocumentReader(JSONReader):
+    """
+    Example reader for a corpus with one JSON document per source, passed as raw bytes.
+    """
+
+    data_directory = os.path.join(os.path.dirname(__file__), "data")
+    single_document = True
+
+    def sources(self, **kwargs):
+        data = json.dumps(
+            {
+                "TITLE": "ACT I",
+                "SCENE": {
+                    "TITLE": "SCENE I. A desert place.",
+                    "STAGEDIR": [
+                        "Thunder and lightning. Enter three Witches",
+                        "Exeunt",
+                    ],
+                    "SPEECH": {
+                        "SPEAKER": "First Witch",
+                    },
+                },
+            }
+        )
+        yield data.encode('utf-8')
+
+    act = Field("act", JSON("TITLE"))
+    character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER"))
+    scene = Field("scene", JSON("SCENE", "TITLE"))
+
+    fields = [act, character, scene]
+
+
+class JSONMultipleDocumentReader(JSONReader):
+    """
+    Example reader for testing the parsing of arrays in JSON, using data from https://github.com/tux255/analyzing-shakespeare
+    """
+
+    data_directory = os.path.join(os.path.dirname(__file__), "data")
+    record_path = ["SCENE", "SPEECH"]
+    meta = ["TITLE", ["SCENE", "TITLE"], ["SCENE", "STAGEDIR"]]
+
+    def sources(self, **kwargs):
+        for filename in glob(f"{self.data_directory}/*.json"):
+            yield filename
+
+    act = Field("act", JSON("TITLE"))
+    scene = Field("scene", JSON("SCENE.TITLE"))
+    character = Field("character", JSON("SPEAKER"))
+    lines = Field("lines", JSON("LINE", transform=merge_lines))
+    stage_dir = Field("stage_direction", JSON("SCENE.STAGEDIR", transform=merge_lines))
+
+    fields = [act, scene, character, lines, stage_dir]
diff --git a/tests/test_json_reader.py b/tests/test_json_reader.py
new file mode 100644
index 0000000..d8111cb
--- /dev/null
+++ b/tests/test_json_reader.py
@@ -0,0 +1,43 @@
+from tests.json.json_reader import JSONDocumentReader, JSONMultipleDocumentReader
+
+expected = [
+    {
+        'act': 'ACT I',
+        'scene': 'SCENE I. A desert place.',
+        'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt',
+        'character': 'First Witch',
+        'lines': 'When shall we three meet again\nIn thunder, lightning, or in rain?',
+    },
+    *[{}] * 8,  # eight intermediate speeches, not compared in detail
+    {
+        'act': 'ACT I',
+        'scene': 'SCENE I. A desert place.',
+        'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt',
+        'character': 'ALL',
+        'lines': "Fair is foul, and foul is fair:\nHover through the fog and filthy air.",
+    },
+]
+
+
+def test_json_parse_single_document():
+    reader = JSONDocumentReader()
+    docs = list(reader.documents())
+    assert len(docs) == 1
+    assert docs[0].get('act') == 'ACT I'
+    assert docs[0].get('character') == 'First Witch'
+    assert docs[0].get('scene') == 'SCENE I. A desert place.'
+
+
+def test_json_parse_multiple_documents():
+    '''Test that the JSON reader can parse multiple documents from an array in a single file.'''
+    reader = JSONMultipleDocumentReader()
+    docs = list(reader.documents())
+    assert len(docs) == len(expected)
+    _assert_matches(expected[0], docs[0])
+    _assert_matches(expected[-1], docs[-1])
+
+
+def _assert_matches(target: dict, doc: dict):
+    assert len(target.keys()) == len(doc.keys())
+    for key in target.keys():
+        assert doc.get(key) == target.get(key)
diff --git a/tests/xml/test_xml_reader.py b/tests/xml/test_xml_reader.py
index 262505c..a0288a4 100644
--- a/tests/xml/test_xml_reader.py
+++ b/tests/xml/test_xml_reader.py
@@ -1,5 +1,7 @@
 import os
 
+import requests
+
 from ianalyzer_readers.readers.xml import XMLReader
 from ianalyzer_readers.readers.core import Field
 from ianalyzer_readers.extract import XML
@@ -44,6 +46,16 @@ def sources(self, **kwargs):
 
     fields = [title, character, lines]
 
+url_list = ['mock_path']
+
+
+class HamletXMLResponseReader(HamletXMLReader):
+    def sources(self, **kwargs):
+        for document_url in url_list:
+            response = requests.get(document_url)
+            yield response
+
 
 target_documents = [
     {
         'title': 'Hamlet',
@@ -94,3 +106,22 @@ def test_xml_reader():
 
     for doc, target in zip(docs, target_documents):
         assert doc == target
+
+
+class MockResponse(requests.Response):
+
+    @property
+    def text(self):
+        test_directory = os.path.dirname(__file__)
+        filename = os.path.join(test_directory, 'data', 'hamlet.xml')
+        with open(filename, "r") as f:
+            return f.read()
+
+
+def test_xml_response_reader(monkeypatch):
+    monkeypatch.setattr(requests, "get", lambda url: MockResponse())
+    reader = HamletXMLResponseReader()
+    docs = reader.documents()
+
+    for doc, target in zip(docs, target_documents):
+        assert doc == target
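
With the pieces above in place, a JSON reader can be defined end to end. The sketch below is modeled on `tests/json/json_reader.py`; the class name, field names, and sample data are illustrative, not part of the diff.

```python
import json

from ianalyzer_readers.extract import JSON
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.readers.json import JSONReader


class PlayReader(JSONReader):
    single_document = True  # one JSON document per source

    # data_directory is unused here, since sources() is overridden.
    data_directory = '.'

    def sources(self, **kwargs):
        # A source may be a file path, raw bytes, or a requests.Response.
        yield json.dumps(
            {'TITLE': 'ACT I', 'SPEECH': {'SPEAKER': 'First Witch'}}
        ).encode('utf-8')

    act = Field('act', JSON('TITLE'))
    speaker = Field('speaker', JSON('SPEECH', 'SPEAKER'))

    fields = [act, speaker]


docs = list(PlayReader().documents())
assert docs == [{'act': 'ACT I', 'speaker': 'First Witch'}]
```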
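The dotted keys in `JSONMultipleDocumentReader` (`SCENE.TITLE`, `SCENE.STAGEDIR`) follow directly from how `pandas.json_normalize` names its columns: record-level keys stay flat, while `meta` paths are joined with dots. A small sketch in plain pandas, with data trimmed down from the Macbeth fixture:

```python
from pandas import json_normalize

data = {
    'TITLE': 'ACT I',
    'SCENE': [{
        'TITLE': 'SCENE I. A desert place.',
        'SPEECH': [
            {'SPEAKER': 'First Witch', 'LINE': 'Where the place?'},
            {'SPEAKER': 'Second Witch', 'LINE': 'Upon the heath.'},
        ],
    }],
}

records = json_normalize(
    data,
    record_path=['SCENE', 'SPEECH'],     # one row per speech
    meta=['TITLE', ['SCENE', 'TITLE']],  # carried along to every row
).to_dict('records')

# Record-level keys stay flat; meta paths become dot-joined column names.
assert records[0] == {
    'SPEAKER': 'First Witch',
    'LINE': 'Where the place?',
    'TITLE': 'ACT I',
    'SCENE.TITLE': 'SCENE I. A desert place.',
}
```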
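Since `Source` now admits `requests.Response` objects, a reader can in principle pull JSON straight from an HTTP endpoint; `_get_json_data` calls `.json()` on the response. A hypothetical sketch, in which the URL is a placeholder and error handling is omitted:

```python
import requests

from ianalyzer_readers.extract import JSON
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.readers.json import JSONReader


class RemotePlayReader(JSONReader):
    single_document = True
    data_directory = '.'  # unused; sources() is overridden

    def sources(self, **kwargs):
        yield requests.get('https://example.com/act1.json')  # placeholder URL

    act = Field('act', JSON('TITLE'))

    fields = [act]
```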