feat: updated parsing, parsing status and post-parse clean
sylvanr committed Feb 12, 2024
1 parent bbaa9a0 commit bdfacda
Showing 8 changed files with 30 additions and 26 deletions.
Binary file modified .coverage
3 changes: 3 additions & 0 deletions direct_indexing/cleaning/dataset.py
@@ -17,6 +17,9 @@ def recursive_attribute_cleaning(data):
 data = {key.replace('@', ''): item for key, item in data.items()}
 # Remove the lang xml tag
 data = {key.replace(XML_LANG_STR_STRIPPED, LANG_STR): item for key, item in data.items()}
+data = {key: item for key, item in data.items() if '._' not in key}
+data = {key: item for key, item in data.items() if 'http' not in key}
+
 # A list of fields that need to be appended to the dataset
 add_fields = {}
 for key, value in data.items():
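For context, a minimal sketch of what the two new comprehensions do, using an invented sample dictionary; the function's other cleaning steps (stripping '@' prefixes, rewriting xml:lang keys) are omitted, and the helper below is not part of the codebase.

# Hypothetical illustration of the new post-parse key filtering; the sample
# keys are invented.
def drop_unwanted_keys(data):
    # Drop any key containing '._'
    data = {key: item for key, item in data.items() if '._' not in key}
    # Drop any key containing 'http', e.g. keys that embed a namespace URL
    data = {key: item for key, item in data.items() if 'http' not in key}
    return data

sample = {
    'iati-identifier': 'XM-EXAMPLE-1',
    'title.narrative._broken': 'left over from a malformed attribute',
    '{http://example.org/ns}custom-element': 'namespaced leftover',
}
print(drop_unwanted_keys(sample))  # {'iati-identifier': 'XM-EXAMPLE-1'}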
1 change: 0 additions & 1 deletion (file path not shown in this view)
@@ -121,7 +121,6 @@ def iterate_third_level_children(child, data, field, target, target_field, total
 field_index = -1
 elif child in item:
 field_index = total_field
-print(item[child])
 if not isinstance(item[child], list):
 total_field += 1
 else:
3 changes: 2 additions & 1 deletion direct_indexing/metadata/dataset.py
@@ -24,7 +24,8 @@ def subtask_process_dataset(dataset, update):
 elif should_retry:
 raise subtask_process_dataset.retry(countdown=60, max_retries=2, exc=DatasetException(message=f'Error indexing dataset {dataset["id"]}\nDataset metadata:\n{result}\nDataset indexing:\n{str(dataset_indexing_result)}')) # NOQA
 else:
-raise DatasetException(message=f'Error indexing dataset {dataset["id"]}\nDataset metadata:\n{result}\nDataset indexing:\n{str(dataset_indexing_result)}') # NOQA
+return "Dataset was not indexed"
+# commented to prevent false positive exceptions. raise DatasetException(message=f'Error indexing dataset {dataset["id"]}\nDataset metadata:\n{result}\nDataset indexing:\n{str(dataset_indexing_result)}') # NOQA
 
 
 def index_datasets_and_dataset_metadata(update, force_update):
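If calling code needs to react to this new behaviour, it can compare against the returned string rather than catching DatasetException. A minimal, hypothetical sketch; handle_result is not part of the project.

# Hypothetical caller-side handling of the reworked return value; the literal
# "Dataset was not indexed" is the string the subtask now returns instead of
# raising DatasetException for non-retryable failures.
def handle_result(result):
    if result == "Dataset was not indexed":
        # Log and move on; no false-positive exception is raised any more.
        print("Dataset skipped: indexing failed and no retry is warranted.")
    else:
        print(f"Subtask finished with: {result}")

handle_result("Dataset was not indexed")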
29 changes: 15 additions & 14 deletions direct_indexing/processing/dataset.py
@@ -8,7 +8,7 @@
 from pysolr import Solr
 from xmljson import badgerfish as bf
 
-from direct_indexing.cleaning.dataset import recursive_attribute_cleaning
+from direct_indexing.cleaning.dataset import recursive_attribute_cleaning#, broken_dataset
 from direct_indexing.cleaning.metadata import clean_dataset_metadata
 from direct_indexing.custom_fields import custom_fields, organisation_custom_fields
 from direct_indexing.custom_fields.models import codelists
@@ -46,7 +46,7 @@ def fun(dataset, update=False):
 # Validate the relevant files, mark others as invalid
 validation_status = 'Valid'
 should_be_indexed = False
-if valid_version and 'dataset.extras.validation_status' in dataset_metadata:
+if 'dataset.extras.validation_status' in dataset_metadata:
 validation_status = 'Invalid' if dataset_metadata['dataset.extras.validation_status'] == 'Critical' else 'Valid'
 
 # Add the validation status to the dataset
@@ -61,14 +61,15 @@
 
 # Index the relevant datasets,
 # these are activity files of a valid version and that have been successfully validated (not critical)
-if validation_status == 'Valid':
+if validation_status == 'Valid' and valid_version:
 indexed, dataset_indexing_result, should_be_indexed = index_dataset(dataset_filepath, dataset_filetype,
 codelist, currencies, dataset_metadata)
 # Add an indexing status to the dataset metadata.
 dataset['iati_cloud_indexed'] = indexed
 dataset['iati_cloud_indexed_datetime'] = str(datetime.now())
 dataset['iati_cloud_should_be_indexed'] = should_be_indexed
-
+if not indexed:
+dataset['iati_cloud_removed_reason'] = dataset_indexing_result
 # Index the dataset metadata
 logging.info('-- Save the dataset metadata')
 result = index(
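To picture the effect on the stored metadata, here is an invented example of a record for a file that failed to index after this change; only the field names and the reason string come from the diff, the values are made up.

# Invented example record; values are illustrative only.
failed_dataset = {
    'id': 'example-dataset-id',
    'iati_cloud_indexed': False,
    'iati_cloud_indexed_datetime': '2024-02-12 12:00:00.000000',
    'iati_cloud_should_be_indexed': False,
    # New: the reason returned by index_dataset() when indexing fails
    'iati_cloud_removed_reason': 'Unable to read XML file.',
}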
@@ -96,16 +96,16 @@ def index_dataset(internal_url, dataset_filetype, codelist, currencies, dataset_
 try:
 core_url = settings.SOLR_ACTIVITY_URL if dataset_filetype == 'activity' else settings.SOLR_ORGANISATION_URL
 logging.info("-- Get JSON path")
-json_path, should_be_indexed = convert_and_save_xml_to_processed_json(internal_url, dataset_filetype,
-codelist, currencies, dataset_metadata)
+json_path, should_be_indexed, p_res = convert_and_save_xml_to_processed_json(internal_url, dataset_filetype,
+codelist, currencies, dataset_metadata)
 if json_path:
 logging.info("-- INDEXING JSON PATH")
 result = index_to_core(core_url, json_path, remove=True)
 logging.info(f'result of indexing {result}')
 if result == 'Successfully indexed':
 return True, result, should_be_indexed
-return False, result, should_be_indexed
-return False, "No JSON Path found", should_be_indexed
+return False, "Unable to index the processed dataset.", False
+return False, p_res, should_be_indexed
 except Exception as e: # NOQA
 logging.warning(f'Exception occurred while indexing {dataset_filetype} dataset:\n{internal_url}\n{e}\nTherefore the dataset will not be indexed.') # NOQA
 return False, str(e), should_be_indexed
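In short, convert_and_save_xml_to_processed_json now returns a three-element tuple (json_path, should_be_indexed, message), and index_dataset forwards the message when no JSON path was produced. A small, hypothetical consumer of that contract; fake_convert is a stand-in, not the real function.

# Hypothetical stand-in mirroring one failure path from the diff above.
def fake_convert(filepath):
    return None, False, "Unable to read XML file."

json_path, should_be_indexed, message = fake_convert("broken.xml")
if not json_path:
    # index_dataset would return (False, message, should_be_indexed) here.
    print(f"Not indexed: {message} (should_be_indexed={should_be_indexed})")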
@@ -130,15 +131,15 @@ def convert_and_save_xml_to_processed_json(filepath, filetype, codelist, currenc
 tree = ET.tostring(etree.getroot())
 except ET.ParseError:
 logging.info(f'-- Error parsing {filepath}')
-return None, should_be_indexed
+return None, should_be_indexed, "Unable to read XML file."
 # Convert the tree to json using BadgerFish method.
 data = bf.data(ET.fromstring(tree))
 # Retrieve activities
 data, data_found = extract_activity_or_organisation_data(filetype, data)
 
 if not data_found:
 logging.info(f'-- No data found in {filepath}')
-return data_found, should_be_indexed
+return data_found, should_be_indexed, "Data was not present in the data dump."
 # Clean the dataset
 data = recursive_attribute_cleaning(data)
 
@@ -147,24 +148,24 @@
 data = custom_fields.add_all(data, codelist, currencies, dataset_metadata)
 if filetype == 'organisation':
 data = organisation_custom_fields.add_all(data)
 
 json_path = json_filepath(filepath)
 if not json_path:
 logging.info(f'-- Error creating json path for {filepath}')
-return False, should_be_indexed
+return False, should_be_indexed, "A JSON path could not be created for the dataset."
 should_be_indexed = True
 logging.info(f'-- Saving to {json_path}')
 try:
 with open(json_path, 'w') as json_file:
 json.dump(data, json_file)
 except Exception:
 logging.info(f'-- Error saving to {json_path}, failed')
-return False, should_be_indexed
+return False, should_be_indexed, "Processed data could not be saved as JSON."
 
 if not settings.FCDO_INSTANCE:
 dataset_subtypes(filetype, data, json_path)
 
-return json_path, should_be_indexed
+return json_path, should_be_indexed, "Success"
 
 
 def extract_activity_or_organisation_data(filetype, data):
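Collected from the hunks above, the outcomes convert_and_save_xml_to_processed_json can now hand back look roughly like this; the shapes are paraphrased from the changed return statements, not taken from a docstring, and the happy-path JSON path is illustrative.

# (json_path, should_be_indexed, message) outcomes after this commit.
CONVERT_OUTCOMES = [
    (None, False, "Unable to read XML file."),                            # XML parse error
    (False, False, "Data was not present in the data dump."),             # no activities/organisations found
    (False, False, "A JSON path could not be created for the dataset."),  # json_filepath failed
    (False, True, "Processed data could not be saved as JSON."),          # json.dump raised
    ("path/to/processed.json", True, "Success"),                          # happy path (illustrative path)
]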
7 changes: 4 additions & 3 deletions tests/direct_indexing/metadata/test_dataset.py
@@ -33,9 +33,10 @@ def test_subtask_process_dataset(mocker, fixture_dataset):
 # Test DatasetException
 res_str_err = 'Error processing dataset'
 mocker.patch(fun_path, return_value=(res_str_err, res_str, False))
-with pytest.raises(DatasetException) as excinfo:
-subtask_process_dataset(fixture_dataset, False)
-assert str(excinfo.value) == f'Error indexing dataset {fixture_dataset["id"]}\nDataset metadata:\n{res_str}\nDataset indexing:\n{str(res_str_err)}' # NOQA
+assert subtask_process_dataset(fixture_dataset, False) == "Dataset was not indexed"
+# with pytest.raises(DatasetException) as excinfo:
+# subtask_process_dataset(fixture_dataset, False)
+# assert str(excinfo.value) == f'Error indexing dataset {fixture_dataset["id"]}\nDataset metadata:\n{res_str}\nDataset indexing:\n{str(res_str_err)}' # NOQA
 
 # Test retry
 mocker.patch(fun_path, return_value=(res_str_err, res_str, True))
1 change: 0 additions & 1 deletion tests/direct_indexing/metadata/test_publisher.py
@@ -28,7 +28,6 @@ def test__preprocess_publisher_metadata(publisher_metadata_sample):
 assert processed[2]['publisher_first_publish_date'] == TEST_VAL_1_NEW
 # Assert other fields are not changed
 assert processed[0]['other'] == TEST_VAL_4
-print(processed)
 
 
 @pytest.fixture
12 changes: 6 additions & 6 deletions tests/direct_indexing/processing/test_dataset.py
@@ -61,18 +61,18 @@ def test_fun(mocker):
 def test_index_dataset(mocker):
 convert_save = 'direct_indexing.processing.dataset.convert_and_save_xml_to_processed_json'
 # mock convert_and_save_xml_to_processed_json, index_to_core
-mock_convert = mocker.patch(convert_save, return_value=(False, False)) # NOQA: 501
+mock_convert = mocker.patch(convert_save, return_value=(False, False, "No JSON Path found")) # NOQA: 501
 mock_index = mocker.patch('direct_indexing.processing.dataset.index_to_core', return_value=INDEX_SUCCESS)
 assert index_dataset(None, None, None, None, None) == (False, 'No JSON Path found', False)
 mock_convert.assert_called_once()
 mock_index.assert_not_called()
 
-mock_convert.return_value = (TEST_PATH, True)
+mock_convert.return_value = (TEST_PATH, True, "Successfully Indexed")
 assert index_dataset(None, None, None, None, None) == (True, INDEX_SUCCESS, True)
 mock_index.assert_called_once()
 
-mock_index.return_value = 'Failed to index'
-assert index_dataset(None, None, None, None, None) == (False, 'Failed to index', True)
+mock_index.return_value = 'Unable to index the processed dataset.'
+assert index_dataset(None, None, None, None, None) == (False, 'Unable to index the processed dataset.', False)
 
 # Test that if index_dataset raises an exception with error message 'test', it returns a tuple False, 'test'
 mocker.patch(convert_save, side_effect=Exception('test')) # NOQA: 501
@@ -140,11 +140,11 @@ def test_convert_and_save_xml_to_processed_json(mocker, tmp_path, fixture_xml_ac
 # Assert if json.dump raises an exception, the return value is None
 mocker.patch('json.dump', side_effect=Exception)
 mock_json_filepath.return_value = str(tmp_path / TEST_JSON)
-assert convert_and_save_xml_to_processed_json(xml_path, 'organisation', None, None, None) == (False, True)
+assert convert_and_save_xml_to_processed_json(xml_path, 'organisation', None, None, None) == (False, True, "Processed data could not be saved as JSON.")
 
 # Assert if ET raises a ParseError, the return value is None
 mocker.patch('xml.etree.ElementTree.parse', side_effect=ET.ParseError)
-assert convert_and_save_xml_to_processed_json(None, None, None, None, None) == (None, False)
+assert convert_and_save_xml_to_processed_json(None, None, None, None, None) == (None, False, "Unable to read XML file.")
 
 
 def test_json_filepath(mocker):
