Skip to content

Commit

Permalink
fix openml-connector dataset-skipping issue (#377)
Browse files Browse the repository at this point in the history
* fix openml-connector swiss-cheese indexing issue

* making offset a local variable.

* removing multiple offset settings

* partial corrections in connector unittest

* correcting openml dataset connector unittests

* corrections in unittest and format

* Remove tests which test old, wrong behavior of skipping assets

---------

Co-authored-by: PGijsbers <[email protected]>
  • Loading branch information
Taniya-Das and PGijsbers authored Oct 29, 2024
1 parent 3f57b2e commit a0df259
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 43 deletions.
12 changes: 7 additions & 5 deletions src/connectors/abstract/resource_connector_by_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,27 @@ def run(
)

first_run = not state
# Setting offset to zero, to avoid "swiss-cheese" indexing (Issue: #372).
# Further, offset is not required to persist in disk anymore.
offset = 0

if first_run and from_identifier is None:
raise ValueError("In the first run, the from-identifier needs to be set")
elif first_run:
state["offset"] = 0
state["from_id"] = from_identifier if from_identifier is not None else 0
else:
state["from_id"] = state["last_id"] + 1
state["offset"] = state["offset"] # TODO: what if datasets are deleted? Or updated?

logging.info(
f"Starting synchronisation of records from id {state['from_id']} and"
f" offset {state['offset']}"
f" offset {offset}"
)

finished = False
n_results = 0
while not finished:
i = 0
for item in self.fetch(offset=state["offset"], from_identifier=state["from_id"]):
for item in self.fetch(offset=offset, from_identifier=state["from_id"]):
if isinstance(item, RecordError) and isinstance(item.error, HTTPError):
yield item
return
Expand All @@ -77,5 +79,5 @@ def run(

finished = i < self.limit_per_iteration
logging.info(f"Finished: {i} < {self.limit_per_iteration}")
state["offset"] += i
offset += i
state["result"] = "Complete run done (although there might be errors)."
33 changes: 14 additions & 19 deletions src/tests/connectors/openml/test_openml_dataset_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,53 +18,48 @@ def test_first_run():

datasets = list(connector.run(state, from_identifier=0, limit=None))

assert state["offset"] == 3, state
assert state["last_id"] == 4, state
assert {d.name for d in datasets} == {"anneal", "labor", "kr-vs-kp"}
assert len(datasets) == 3
assert {len(d.citation) for d in datasets} == {0}


def test_request_empty_list():
"""Tests if the state doesn't change after a request when OpenML returns an empty list."""
state = {"offset": 2, "last_id": 3}
state = {"last_id": 3}
connector = OpenMlDatasetConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mocked_requests.add(
responses.GET,
f"{OPENML_URL}/data/list/limit/2/offset/2",
f"{OPENML_URL}/data/list/limit/2/offset/0",
json={"error": {"code": "372", "message": "No results"}},
status=412,
)
datasets = list(connector.run(state, from_identifier=0, limit=None))

assert len(datasets) == 1, datasets
assert "No results" in datasets[0].error.args[0], datasets
assert state["offset"] == 2, state
assert state["last_id"] == 3, state


def test_second_run():
connector = OpenMlDatasetConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=2)
mock_get_data(mocked_requests, "4")
datasets = list(
connector.run(state={"offset": 2, "last_id": 3}, from_identifier=0, limit=None)
)
assert len(datasets) == 1
assert {d.name for d in datasets} == {"labor"}


def test_second_run_wrong_identifier():
connector = OpenMlDatasetConnector(limit_per_iteration=2)
state = {"last_id": 3} # state from last run
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=2)
mock_list_data(
mocked_requests, offset=0
) # Mock the first call to fetch datasets with offset=0
mock_list_data(
mocked_requests, offset=2
) # Mock the second call for pagination with offset=2
mock_get_data(mocked_requests, "4")
datasets = list(
connector.run(state={"offset": 2, "last_id": 1}, from_identifier=0, limit=None)
)
datasets = list(connector.run(state=state, from_identifier=1, limit=None))

assert len(datasets) == 1
assert {d.name for d in datasets} == {"labor"}
assert state["last_id"] == 4, state
assert state["from_id"] == 4, state # state["from_id"] = state["last_id"] + 1


def mock_list_data(mocked_requests, offset):
Expand Down
29 changes: 10 additions & 19 deletions src/tests/connectors/openml/test_openml_mlmodel_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_first_run():
mock_get_data(mocked_requests, str(i))
mlmodels = list(connector.run(state, from_identifier=0, limit=None))

assert state["offset"] == 3, state
assert state["last_id"] == 3, state
assert {m.resource.name for m in mlmodels} == {
"openml.evaluation.EuclideanDistance",
"openml.evaluation.PolynomialKernel",
Expand All @@ -32,31 +32,34 @@ def test_first_run():

def test_request_empty_list():
"""Tests if the state doesn't change after a request when OpenML returns an empty list."""
state = {"offset": 2, "last_id": 3}
state = {"last_id": 3}
connector = OpenMlMLModelConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mocked_requests.add(
responses.GET,
f"{OPENML_URL}/flow/list/limit/2/offset/2",
f"{OPENML_URL}/flow/list/limit/2/offset/0",
json={"error": {"code": "500", "message": "No results"}},
status=412,
)
ml_models = list(connector.run(state, from_identifier=0, limit=None))

assert len(ml_models) == 1, ml_models
assert "No results" in ml_models[0].error.args[0], ml_models
assert state["offset"] == 2, state
assert state["last_id"] == 3, state


def test_second_run():
connector = OpenMlMLModelConnector(limit_per_iteration=2)

state = {"last_id": 2, "from_id": 0}
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=0)
mock_list_data(mocked_requests, offset=2)
mock_get_data(mocked_requests, "3")
mlmodels = list(
connector.run(state={"offset": 2, "last_id": 2}, from_identifier=0, limit=None)
)
mlmodels = list(connector.run(state=state, from_identifier=0, limit=None))
assert state["last_id"] == 3, state
assert state["from_id"] == 3, state

assert len(mlmodels) == 1
assert {m.resource.name for m in mlmodels} == {"openml.evaluation.RBFKernel"}
mlmodel = mlmodels[0].resource
Expand All @@ -71,18 +74,6 @@ def test_second_run():
assert mlmodels[0].related_resources["creator"][0].name == "Jan N. van Rijn"


def test_second_run_wrong_identifier():
connector = OpenMlMLModelConnector(limit_per_iteration=2)
with responses.RequestsMock() as mocked_requests:
mock_list_data(mocked_requests, offset=2)
mock_get_data(mocked_requests, "3")
mlmodels = list(
connector.run(state={"offset": 2, "last_id": 0}, from_identifier=0, limit=None)
)
assert len(mlmodels) == 1
assert {m.resource.name for m in mlmodels} == {"openml.evaluation.RBFKernel"}


def mock_list_data(mocked_requests, offset):
"""
Mocking requests to the OpenML dependency, so that we test only our own services
Expand Down

0 comments on commit a0df259

Please sign in to comment.