Skip to content

Commit

Permalink
fix openml-connector dataset-skipping issue (#377)
Browse files Browse the repository at this point in the history
* fix openml-connector swiss-cheese indexing issue

* making offset a local variable.

* removing multiple offset settings

* partial corrections in connector unittest

* correcting openml dataset connector unittests

* corrections in unittest and format

* Remove tests which test old, wrong behavior of skipping assets

---------

Co-authored-by: PGijsbers <[email protected]>
  • Loading branch information
Taniya-Das and PGijsbers authored Oct 29, 2024
1 parent 3f57b2e commit a0df259
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 43 deletions.
12 changes: 7 additions & 5 deletions src/connectors/abstract/resource_connector_by_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,27 @@ def run(
)

first_run = not state
# Setting offset to zero, to avoid "swiss-cheese" indexing (Issue: #372).
# Further, offset is not required to persist in disk anymore.
offset = 0

if first_run and from_identifier is None:
raise ValueError("In the first run, the from-identifier needs to be set")
elif first_run:
state["offset"] = 0
state["from_id"] = from_identifier if from_identifier is not None else 0
else:
state["from_id"] = state["last_id"] + 1
state["offset"] = state["offset"] # TODO: what if datasets are deleted? Or updated?

logging.info(
f"Starting synchronisation of records from id {state['from_id']} and"
f" offset {state['offset']}"
f" offset {offset}"
)

finished = False
n_results = 0
while not finished:
i = 0
for item in self.fetch(offset=state["offset"], from_identifier=state["from_id"]):
for item in self.fetch(offset=offset, from_identifier=state["from_id"]):
if isinstance(item, RecordError) and isinstance(item.error, HTTPError):
yield item
return
Expand All @@ -77,5 +79,5 @@ def run(

finished = i < self.limit_per_iteration
logging.info(f"Finished: {i} < {self.limit_per_iteration}")
state["offset"] += i
offset += i
state["result"] = "Complete run done (although there might be errors)."
33 changes: 14 additions & 19 deletions src/tests/connectors/openml/test_openml_dataset_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,53 +18,48 @@ def test_first_run():

datasets = list(connector.run(state, from_identifier=0, limit=None))

assert state["offset"] == 3, state
assert state["last_id"] == 4, state
assert {d.name for d in datasets} == {"anneal", "labor", "kr-vs-kp"}
assert len(datasets) == 3
assert {len(d.citation) for d in datasets} == {0}


def test_request_empty_list():
"""Tests if the state doesn't change after a request when OpenML returns an empty list."""
state = {"offset": 2, "last_id": 3}
state = {"last_id": 3}
connector = OpenMlDatasetConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mocked_requests.add(
responses.GET,
f"{OPENML_URL}/data/list/limit/2/offset/2",
f"{OPENML_URL}/data/list/limit/2/offset/0",
json={"error": {"code": "372", "message": "No results"}},
status=412,
)
datasets = list(connector.run(state, from_identifier=0, limit=None))

assert len(datasets) == 1, datasets
assert "No results" in datasets[0].error.args[0], datasets
assert state["offset"] == 2, state
assert state["last_id"] == 3, state


def test_second_run():
connector = OpenMlDatasetConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=2)
mock_get_data(mocked_requests, "4")
datasets = list(
connector.run(state={"offset": 2, "last_id": 3}, from_identifier=0, limit=None)
)
assert len(datasets) == 1
assert {d.name for d in datasets} == {"labor"}


def test_second_run_wrong_identifier():
connector = OpenMlDatasetConnector(limit_per_iteration=2)
state = {"last_id": 3} # state from last run
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=2)
mock_list_data(
mocked_requests, offset=0
) # Mock the first call to fetch datasets with offset=0
mock_list_data(
mocked_requests, offset=2
) # Mock the second call for pagination with offset=2
mock_get_data(mocked_requests, "4")
datasets = list(
connector.run(state={"offset": 2, "last_id": 1}, from_identifier=0, limit=None)
)
datasets = list(connector.run(state=state, from_identifier=1, limit=None))

assert len(datasets) == 1
assert {d.name for d in datasets} == {"labor"}
assert state["last_id"] == 4, state
assert state["from_id"] == 4, state # state["from_id"] = state["last_id"] + 1


def mock_list_data(mocked_requests, offset):
Expand Down
29 changes: 10 additions & 19 deletions src/tests/connectors/openml/test_openml_mlmodel_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_first_run():
mock_get_data(mocked_requests, str(i))
mlmodels = list(connector.run(state, from_identifier=0, limit=None))

assert state["offset"] == 3, state
assert state["last_id"] == 3, state
assert {m.resource.name for m in mlmodels} == {
"openml.evaluation.EuclideanDistance",
"openml.evaluation.PolynomialKernel",
Expand All @@ -32,31 +32,34 @@ def test_first_run():

def test_request_empty_list():
"""Tests if the state doesn't change after a request when OpenML returns an empty list."""
state = {"offset": 2, "last_id": 3}
state = {"last_id": 3}
connector = OpenMlMLModelConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mocked_requests.add(
responses.GET,
f"{OPENML_URL}/flow/list/limit/2/offset/2",
f"{OPENML_URL}/flow/list/limit/2/offset/0",
json={"error": {"code": "500", "message": "No results"}},
status=412,
)
ml_models = list(connector.run(state, from_identifier=0, limit=None))

assert len(ml_models) == 1, ml_models
assert "No results" in ml_models[0].error.args[0], ml_models
assert state["offset"] == 2, state
assert state["last_id"] == 3, state


def test_second_run():
connector = OpenMlMLModelConnector(limit_per_iteration=2)

state = {"last_id": 2, "from_id": 0}
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=0)
mock_list_data(mocked_requests, offset=2)
mock_get_data(mocked_requests, "3")
mlmodels = list(
connector.run(state={"offset": 2, "last_id": 2}, from_identifier=0, limit=None)
)
mlmodels = list(connector.run(state=state, from_identifier=0, limit=None))
assert state["last_id"] == 3, state
assert state["from_id"] == 3, state

assert len(mlmodels) == 1
assert {m.resource.name for m in mlmodels} == {"openml.evaluation.RBFKernel"}
mlmodel = mlmodels[0].resource
Expand All @@ -71,18 +74,6 @@ def test_second_run():
assert mlmodels[0].related_resources["creator"][0].name == "Jan N. van Rijn"


def test_second_run_wrong_identifier():
connector = OpenMlMLModelConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=2)
mock_get_data(mocked_requests, "3")
mlmodels = list(
connector.run(state={"offset": 2, "last_id": 0}, from_identifier=0, limit=None)
)
assert len(mlmodels) == 1
assert {m.resource.name for m in mlmodels} == {"openml.evaluation.RBFKernel"}


def mock_list_data(mocked_requests, offset):
"""
Mocking requests to the OpenML dependency, so that we test only our own services
Expand Down

0 comments on commit a0df259

Please sign in to comment.