From ede4185ee698d404922add1e14e80d9a43f08570 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 16 Jan 2025 12:19:45 +0100 Subject: [PATCH] implement required fields in core --- ianalyzer_readers/readers/core.py | 27 ++++++++++++++++++++++++--- ianalyzer_readers/readers/xml.py | 6 +----- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py index b8d69f9..d3881bd 100644 --- a/ianalyzer_readers/readers/core.py +++ b/ianalyzer_readers/readers/core.py @@ -139,6 +139,15 @@ def fieldnames(self) -> List[str]: ''' return [field.name for field in self.fields] + + @property + def _required_field_names(self) -> List[str]: + ''' + A list of the names of all required fields + ''' + return [field.name for field in self.fields if field.required] + + def sources(self, **kwargs) -> Iterable[Source]: ''' Obtain source files for the Reader. @@ -176,10 +185,12 @@ def source2dicts(self, source: Source) -> Iterable[Document]: if isinstance(data, AbstractContextManager): with data as data: for document in self.iterate_data(data, metadata): - yield document + if self._has_required_fields(document): + yield document else: - for doc in self.iterate_data(data, metadata): - yield doc + for document in self.iterate_data(data, metadata): + if self._has_required_fields(document): + yield document def data_and_metadata_from_source(self, source: Source) -> Tuple[Any, Dict]: @@ -361,3 +372,13 @@ def _reject_extractors(self, *inapplicable_extractors: extract.Extractor): if isinstance(field.extractor, inapplicable_extractors): raise RuntimeError( "Specified extractor method cannot be used with this type of data") + + def _has_required_fields(self, document: Document) -> bool: + ''' + Check whether a document has a value for all fields marked as required. 
+ ''' + + has_field = lambda field_name: document.get(field_name, None) is not None + return all( + has_field(field_name) for field_name in self._required_field_names + ) diff --git a/ianalyzer_readers/readers/xml.py b/ianalyzer_readers/readers/xml.py index e817a3f..a96757b 100644 --- a/ianalyzer_readers/readers/xml.py +++ b/ianalyzer_readers/readers/xml.py @@ -96,9 +96,6 @@ def iterate_data(self, data: bs4.BeautifulSoup, metadata: Dict) -> Iterable[Docu ) external_soup = None - required_fields = [ - field.name for field in self.fields if field.required] - # iterate through entries top_tag = resolve_tag_specification(self.__class__.tag_toplevel, metadata) bowl = top_tag.find_next_in_soup(data) @@ -129,8 +126,7 @@ def iterate_data(self, data: bs4.BeautifulSoup, metadata: Dict) -> Iterable[Docu # yield the union of external fields and document fields field_dict.update(external_dict) - if all(field_name in field_dict for field_name in required_fields): - yield field_dict + yield field_dict else: logger.warning('Top-level tag not found')