Skip to content

Commit

Permalink
implement required fields in core
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas committed Jan 16, 2025
1 parent 8a7feba commit ede4185
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 8 deletions.
27 changes: 24 additions & 3 deletions ianalyzer_readers/readers/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,15 @@ def fieldnames(self) -> List[str]:
'''
return [field.name for field in self.fields]


@property
def _required_field_names(self) -> List[str]:
    '''
    Names of the fields that are flagged as required.

    Walks `self.fields` in order and collects the `name` of every field
    whose `required` attribute is truthy.
    '''
    names = []
    for field in self.fields:
        if field.required:
            names.append(field.name)
    return names


def sources(self, **kwargs) -> Iterable[Source]:
'''
Obtain source files for the Reader.
Expand Down Expand Up @@ -176,10 +185,12 @@ def source2dicts(self, source: Source) -> Iterable[Document]:
if isinstance(data, AbstractContextManager):
with data as data:
for document in self.iterate_data(data, metadata):
yield document
if self._has_required_fields(document):
yield document
else:
for doc in self.iterate_data(data, metadata):
yield doc
for document in self.iterate_data(data, metadata):
if self._has_required_fields(document):
yield document


def data_and_metadata_from_source(self, source: Source) -> Tuple[Any, Dict]:
Expand Down Expand Up @@ -361,3 +372,13 @@ def _reject_extractors(self, *inapplicable_extractors: extract.Extractor):
if isinstance(field.extractor, inapplicable_extractors):
raise RuntimeError(
"Specified extractor method cannot be used with this type of data")

def _has_required_fields(self, document: Document) -> bool:
    '''
    Check whether a document has a value for all fields marked as required.

    Parameters:
        document: the extracted document; it is queried with `.get(name)`
            for each required field name.

    Returns:
        `True` if every name in `self._required_field_names` is present in
        the document with a value other than `None`, `False` otherwise.
        When no fields are required, the result is `True`.
    '''
    # A missing key and a key mapped to `None` are both treated as "no value";
    # other falsy values (empty string, 0, empty list) count as present.
    return all(
        document.get(field_name) is not None
        for field_name in self._required_field_names
    )
6 changes: 1 addition & 5 deletions ianalyzer_readers/readers/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,6 @@ def iterate_data(self, data: bs4.BeautifulSoup, metadata: Dict) -> Iterable[Docu
)
external_soup = None

required_fields = [
field.name for field in self.fields if field.required]

# iterate through entries
top_tag = resolve_tag_specification(self.__class__.tag_toplevel, metadata)
bowl = top_tag.find_next_in_soup(data)
Expand Down Expand Up @@ -129,8 +126,7 @@ def iterate_data(self, data: bs4.BeautifulSoup, metadata: Dict) -> Iterable[Docu

# yield the union of external fields and document fields
field_dict.update(external_dict)
if all(field_name in field_dict for field_name in required_fields):
yield field_dict
yield field_dict
else:
logger.warning('Top-level tag not found')

Expand Down

0 comments on commit ede4185

Please sign in to comment.