feat: updated parsing, parsing status and post-parse clean
sylvanr committed Feb 12, 2024
1 parent bbaa9a0 commit bdfacda
Showing 8 changed files with 30 additions and 26 deletions.
Binary file modified .coverage
3 changes: 3 additions & 0 deletions direct_indexing/cleaning/dataset.py
@@ -17,6 +17,9 @@ def recursive_attribute_cleaning(data):
 data = {key.replace('@', ''): item for key, item in data.items()}
 # Remove the lang xml tag
 data = {key.replace(XML_LANG_STR_STRIPPED, LANG_STR): item for key, item in data.items()}
+data = {key: item for key, item in data.items() if '._' not in key}
+data = {key: item for key, item in data.items() if 'http' not in key}
+
 # A list of fields that need to be appended to the dataset
 add_fields = {}
 for key, value in data.items():
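For context, a minimal sketch of what the two new comprehensions do, using an invented sample dictionary; the function's other cleaning steps (stripping '@' prefixes, rewriting xml:lang keys) are omitted, and the helper below is not part of the codebase.

# Hypothetical illustration of the new post-parse key filtering; the sample
# keys are invented.
def drop_unwanted_keys(data):
    # Drop any key containing '._'
    data = {key: item for key, item in data.items() if '._' not in key}
    # Drop any key containing 'http', e.g. keys that embed a namespace URL
    data = {key: item for key, item in data.items() if 'http' not in key}
    return data

sample = {
    'iati-identifier': 'XM-EXAMPLE-1',
    'title.narrative._broken': 'left over from a malformed attribute',
    '{http://example.org/ns}custom-element': 'namespaced leftover',
}
print(drop_unwanted_keys(sample))  # {'iati-identifier': 'XM-EXAMPLE-1'}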
1 change: 0 additions & 1 deletion (file path not shown in this view)
@@ -121,7 +121,6 @@ def iterate_third_level_children(child, data, field, target, target_field, total
 field_index = -1
 elif child in item:
 field_index = total_field
-print(item[child])
 if not isinstance(item[child], list):
 total_field += 1
 else:
3 changes: 2 additions & 1 deletion direct_indexing/metadata/dataset.py
@@ -24,7 +24,8 @@ def subtask_process_dataset(dataset, update):
 elif should_retry:
 raise subtask_process_dataset.retry(countdown=60, max_retries=2, exc=DatasetException(message=f'Error indexing dataset {dataset["id"]}\nDataset metadata:\n{result}\nDataset indexing:\n{str(dataset_indexing_result)}')) # NOQA
 else:
-raise DatasetException(message=f'Error indexing dataset {dataset["id"]}\nDataset metadata:\n{result}\nDataset indexing:\n{str(dataset_indexing_result)}') # NOQA
+return "Dataset was not indexed"
+# commented to prevent false positive exceptions. raise DatasetException(message=f'Error indexing dataset {dataset["id"]}\nDataset metadata:\n{result}\nDataset indexing:\n{str(dataset_indexing_result)}') # NOQA
 
 
 def index_datasets_and_dataset_metadata(update, force_update):
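If calling code needs to react to this new behaviour, it can compare against the returned string rather than catching DatasetException. A minimal, hypothetical sketch; handle_result is not part of the project.

# Hypothetical caller-side handling of the reworked return value; the literal
# "Dataset was not indexed" is the string the subtask now returns instead of
# raising DatasetException for non-retryable failures.
def handle_result(result):
    if result == "Dataset was not indexed":
        # Log and move on; no false-positive exception is raised any more.
        print("Dataset skipped: indexing failed and no retry is warranted.")
    else:
        print(f"Subtask finished with: {result}")

handle_result("Dataset was not indexed")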
29 changes: 15 additions & 14 deletions direct_indexing/processing/dataset.py
@@ -8,7 +8,7 @@
 from pysolr import Solr
 from xmljson import badgerfish as bf
 
-from direct_indexing.cleaning.dataset import recursive_attribute_cleaning
+from direct_indexing.cleaning.dataset import recursive_attribute_cleaning#, broken_dataset
 from direct_indexing.cleaning.metadata import clean_dataset_metadata
 from direct_indexing.custom_fields import custom_fields, organisation_custom_fields
 from direct_indexing.custom_fields.models import codelists
@@ -46,7 +46,7 @@ def fun(dataset, update=False):
 # Validate the relevant files, mark others as invalid
 validation_status = 'Valid'
 should_be_indexed = False
-if valid_version and 'dataset.extras.validation_status' in dataset_metadata:
+if 'dataset.extras.validation_status' in dataset_metadata:
 validation_status = 'Invalid' if dataset_metadata['dataset.extras.validation_status'] == 'Critical' else 'Valid'
 
 # Add the validation status to the dataset
@@ -61,14 +61,15 @@
 
 # Index the relevant datasets,
 # these are activity files of a valid version and that have been successfully validated (not critical)
-if validation_status == 'Valid':
+if validation_status == 'Valid' and valid_version:
 indexed, dataset_indexing_result, should_be_indexed = index_dataset(dataset_filepath, dataset_filetype,
 codelist, currencies, dataset_metadata)
 # Add an indexing status to the dataset metadata.
 dataset['iati_cloud_indexed'] = indexed
 dataset['iati_cloud_indexed_datetime'] = str(datetime.now())
 dataset['iati_cloud_should_be_indexed'] = should_be_indexed
-
+if not indexed:
+dataset['iati_cloud_removed_reason'] = dataset_indexing_result
 # Index the dataset metadata
 logging.info('-- Save the dataset metadata')
 result = index(
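To picture the effect on the stored metadata, here is an invented example of a record for a file that failed to index after this change; only the field names and the reason string come from the diff, the values are made up.

# Invented example record; values are illustrative only.
failed_dataset = {
    'id': 'example-dataset-id',
    'iati_cloud_indexed': False,
    'iati_cloud_indexed_datetime': '2024-02-12 12:00:00.000000',
    'iati_cloud_should_be_indexed': False,
    # New: the reason returned by index_dataset() when indexing fails
    'iati_cloud_removed_reason': 'Unable to read XML file.',
}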
@@ -96,16 +96,16 @@ def index_dataset(internal_url, dataset_filetype, codelist, currencies, dataset_
 try:
 core_url = settings.SOLR_ACTIVITY_URL if dataset_filetype == 'activity' else settings.SOLR_ORGANISATION_URL
 logging.info("-- Get JSON path")
-json_path, should_be_indexed = convert_and_save_xml_to_processed_json(internal_url, dataset_filetype,
-codelist, currencies, dataset_metadata)
+json_path, should_be_indexed, p_res = convert_and_save_xml_to_processed_json(internal_url, dataset_filetype,
+codelist, currencies, dataset_metadata)
 if json_path:
 logging.info("-- INDEXING JSON PATH")
 result = index_to_core(core_url, json_path, remove=True)
 logging.info(f'result of indexing {result}')
 if result == 'Successfully indexed':
 return True, result, should_be_indexed
-return False, result, should_be_indexed
-return False, "No JSON Path found", should_be_indexed
+return False, "Unable to index the processed dataset.", False
+return False, p_res, should_be_indexed
 except Exception as e: # NOQA
 logging.warning(f'Exception occurred while indexing {dataset_filetype} dataset:\n{internal_url}\n{e}\nTherefore the dataset will not be indexed.') # NOQA
 return False, str(e), should_be_indexed
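In short, convert_and_save_xml_to_processed_json now returns a three-element tuple (json_path, should_be_indexed, message), and index_dataset forwards the message when no JSON path was produced. A small, hypothetical consumer of that contract; fake_convert is a stand-in, not the real function.

# Hypothetical stand-in mirroring one failure path from the diff above.
def fake_convert(filepath):
    return None, False, "Unable to read XML file."

json_path, should_be_indexed, message = fake_convert("broken.xml")
if not json_path:
    # index_dataset would return (False, message, should_be_indexed) here.
    print(f"Not indexed: {message} (should_be_indexed={should_be_indexed})")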
@@ -130,15 +131,15 @@ def convert_and_save_xml_to_processed_json(filepath, filetype, codelist, currenc
 tree = ET.tostring(etree.getroot())
 except ET.ParseError:
 logging.info(f'-- Error parsing {filepath}')
-return None, should_be_indexed
+return None, should_be_indexed, "Unable to read XML file."
 # Convert the tree to json using BadgerFish method.
 data = bf.data(ET.fromstring(tree))
 # Retrieve activities
 data, data_found = extract_activity_or_organisation_data(filetype, data)
 
 if not data_found:
 logging.info(f'-- No data found in {filepath}')
-return data_found, should_be_indexed
+return data_found, should_be_indexed, "Data was not present in the data dump."
 # Clean the dataset
 data = recursive_attribute_cleaning(data)
 
@@ -147,24 +148,24 @@
 data = custom_fields.add_all(data, codelist, currencies, dataset_metadata)
 if filetype == 'organisation':
 data = organisation_custom_fields.add_all(data)
 
 json_path = json_filepath(filepath)
 if not json_path:
 logging.info(f'-- Error creating json path for {filepath}')
-return False, should_be_indexed
+return False, should_be_indexed, "A JSON path could not be created for the dataset."
 should_be_indexed = True
 logging.info(f'-- Saving to {json_path}')
 try:
 with open(json_path, 'w') as json_file:
 json.dump(data, json_file)
 except Exception:
 logging.info(f'-- Error saving to {json_path}, failed')
-return False, should_be_indexed
+return False, should_be_indexed, "Processed data could not be saved as JSON."
 
 if not settings.FCDO_INSTANCE:
 dataset_subtypes(filetype, data, json_path)
 
-return json_path, should_be_indexed
+return json_path, should_be_indexed, "Success"
 
 
 def extract_activity_or_organisation_data(filetype, data):
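Collected from the hunks above, the outcomes convert_and_save_xml_to_processed_json can now hand back look roughly like this; the shapes are paraphrased from the changed return statements, not taken from a docstring, and the happy-path JSON path is illustrative.

# (json_path, should_be_indexed, message) outcomes after this commit.
CONVERT_OUTCOMES = [
    (None, False, "Unable to read XML file."),                            # XML parse error
    (False, False, "Data was not present in the data dump."),             # no activities/organisations found
    (False, False, "A JSON path could not be created for the dataset."),  # json_filepath failed
    (False, True, "Processed data could not be saved as JSON."),          # json.dump raised
    ("path/to/processed.json", True, "Success"),                          # happy path (illustrative path)
]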
7 changes: 4 additions & 3 deletions tests/direct_indexing/metadata/test_dataset.py
@@ -33,9 +33,10 @@ def test_subtask_process_dataset(mocker, fixture_dataset):
 # Test DatasetException
 res_str_err = 'Error processing dataset'
 mocker.patch(fun_path, return_value=(res_str_err, res_str, False))
-with pytest.raises(DatasetException) as excinfo:
-subtask_process_dataset(fixture_dataset, False)
-assert str(excinfo.value) == f'Error indexing dataset {fixture_dataset["id"]}\nDataset metadata:\n{res_str}\nDataset indexing:\n{str(res_str_err)}' # NOQA
+assert subtask_process_dataset(fixture_dataset, False) == "Dataset was not indexed"
+# with pytest.raises(DatasetException) as excinfo:
+# subtask_process_dataset(fixture_dataset, False)
+# assert str(excinfo.value) == f'Error indexing dataset {fixture_dataset["id"]}\nDataset metadata:\n{res_str}\nDataset indexing:\n{str(res_str_err)}' # NOQA
 
 # Test retry
 mocker.patch(fun_path, return_value=(res_str_err, res_str, True))
1 change: 0 additions & 1 deletion tests/direct_indexing/metadata/test_publisher.py
@@ -28,7 +28,6 @@ def test__preprocess_publisher_metadata(publisher_metadata_sample):
 assert processed[2]['publisher_first_publish_date'] == TEST_VAL_1_NEW
 # Assert other fields are not changed
 assert processed[0]['other'] == TEST_VAL_4
-print(processed)
 
 
 @pytest.fixture
12 changes: 6 additions & 6 deletions tests/direct_indexing/processing/test_dataset.py
@@ -61,18 +61,18 @@ def test_fun(mocker):
 def test_index_dataset(mocker):
 convert_save = 'direct_indexing.processing.dataset.convert_and_save_xml_to_processed_json'
 # mock convert_and_save_xml_to_processed_json, index_to_core
-mock_convert = mocker.patch(convert_save, return_value=(False, False)) # NOQA: 501
+mock_convert = mocker.patch(convert_save, return_value=(False, False, "No JSON Path found")) # NOQA: 501
 mock_index = mocker.patch('direct_indexing.processing.dataset.index_to_core', return_value=INDEX_SUCCESS)
 assert index_dataset(None, None, None, None, None) == (False, 'No JSON Path found', False)
 mock_convert.assert_called_once()
 mock_index.assert_not_called()
 
-mock_convert.return_value = (TEST_PATH, True)
+mock_convert.return_value = (TEST_PATH, True, "Successfully Indexed")
 assert index_dataset(None, None, None, None, None) == (True, INDEX_SUCCESS, True)
 mock_index.assert_called_once()
 
-mock_index.return_value = 'Failed to index'
-assert index_dataset(None, None, None, None, None) == (False, 'Failed to index', True)
+mock_index.return_value = 'Unable to index the processed dataset.'
+assert index_dataset(None, None, None, None, None) == (False, 'Unable to index the processed dataset.', False)
 
 # Test that if index_dataset raises an exception with error message 'test', it returns a tuple False, 'test'
 mocker.patch(convert_save, side_effect=Exception('test')) # NOQA: 501
@@ -140,11 +140,11 @@ def test_convert_and_save_xml_to_processed_json(mocker, tmp_path, fixture_xml_ac
 # Assert if json.dump raises an exception, the return value is None
 mocker.patch('json.dump', side_effect=Exception)
 mock_json_filepath.return_value = str(tmp_path / TEST_JSON)
-assert convert_and_save_xml_to_processed_json(xml_path, 'organisation', None, None, None) == (False, True)
+assert convert_and_save_xml_to_processed_json(xml_path, 'organisation', None, None, None) == (False, True, "Processed data could not be saved as JSON.")
 
 # Assert if ET raises a ParseError, the return value is None
 mocker.patch('xml.etree.ElementTree.parse', side_effect=ET.ParseError)
-assert convert_and_save_xml_to_processed_json(None, None, None, None, None) == (None, False)
+assert convert_and_save_xml_to_processed_json(None, None, None, None, None) == (None, False, "Unable to read XML file.")
 
 
 def test_json_filepath(mocker):
