Skip to content

Commit

Permalink
Empty produces leads to list index out of range (#924)
Browse files Browse the repository at this point in the history
When no produces is defined, the location of the dataset couldn't be
determined, leading to a list index out of range error.
The dataset index is now used to determine the dataset location.
  • Loading branch information
mrchtr authored Apr 8, 2024
1 parent a1aa2fe commit 92ef1fe
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 19 deletions.
8 changes: 4 additions & 4 deletions src/fondant/component/data_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,11 @@ def validate_dataframe_columns(dataframe: dd.DataFrame, columns: t.List[str]):

def _write_dataframe(self, dataframe: dd.DataFrame) -> None:
"""Create dataframe writing task."""
location = self.manifest.get_dataset_columns_locations(
columns=dataframe.columns,
)
output_location_path = self.manifest.index.location

output_location_path = location[0]
if not output_location_path:
msg = "No output location determined. Can not export the dataset."
raise ValueError(msg)

# Create directory the dataframe will be written to, since this is not handled by Pandas
# `to_parquet` method.
Expand Down
15 changes: 0 additions & 15 deletions src/fondant/core/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,21 +141,6 @@ def to_file(self, path: t.Union[str, Path]) -> None:
def manifest_location(self):
return self._specification["metadata"]["manifest_location"]

def get_dataset_columns_locations(
    self,
    columns: t.List[str],
) -> t.List[str]:
    """Return the locations of the fields whose names appear in `columns`.

    Only entries that are `Field` instances with a non-empty location are
    included, so the result may be empty when nothing matches.
    """
    locations = []
    for _, field in self.fields.items():
        # Ignore fields that were not requested by name.
        if field.name not in columns:
            continue
        if isinstance(field, Field) and field.location:
            locations.append(field.location)
    return locations

def copy(self) -> "Manifest":
    """Build a new instance of the same class from a deep copy of the specification."""
    # Deep-copy so changes to the clone never reach this instance's specification.
    cloned_specification = copy.deepcopy(self._specification)
    return self.__class__(cloned_specification)
Expand Down
6 changes: 6 additions & 0 deletions src/fondant/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ def __init__(
consumes = self._infer_consumes(component_spec, dataset_fields)
consumes = self._validate_consumes(consumes, component_spec, dataset_fields)

if produces is None and component_spec.produces_additional_properties:
logger.warning(
"Can not infer produces. "
"The component will not produce any new columns.",
)

self.operation_spec = OperationSpec(
self.component_spec,
consumes=consumes,
Expand Down
15 changes: 15 additions & 0 deletions tests/core/examples/evolution_examples/7/component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Example component 1
description: This is an example component
image: example_component_1:latest

consumes:
images_data:
type: binary

produces:
additionalProperties: true

args:
storage_args:
description: Storage arguments
type: str
33 changes: 33 additions & 0 deletions tests/core/examples/evolution_examples/7/output_manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"metadata": {
"dataset_name": "test_dataset",
"manifest_location": "gs://bucket/dataset",
"run_id": "custom_run_id",
"component_id": "example_component_1"
},
"index": {
"location": "gs://bucket/dataset/test_dataset/custom_run_id/example_component_1"
},
"fields": {
"images_width": {
"type": "int32",
"location": "gs://bucket/dataset/custom_run_id/example_component"
},
"images_height": {
"type": "int32",
"location": "gs://bucket/dataset/custom_run_id/example_component"
},
"images_data": {
"location": "gs://bucket/dataset/custom_run_id/example_component",
"type": "binary"
},
"text_string": {
"type": "string",
"location": "gs://bucket/dataset/test_dataset/custom_run_id/example_component_1"
},
"captions_data": {
"type": "binary",
"location": "gs://bucket/dataset/custom_run_id/example_component"
}
}
}
5 changes: 5 additions & 0 deletions tests/core/test_manifest_evolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
"text_data": "text_string",
},
},
"7": {
"produces": {
"text_string": pa.string(),
},
},
}

INVALID_EXAMPLES = {
Expand Down
25 changes: 25 additions & 0 deletions tests/pipeline/test_lightweight_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,3 +596,28 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
},
},
}


def test_warning_is_logged_when_produces_is_not_defined(caplog):
    """Creating a dataset from a component without `produces` logs a warning."""

    @lightweight_component
    class CreateData(DaskLoadComponent):
        def load(self) -> dd.DataFrame:
            records = {
                "x": [1, 2, 3],
                "y": [4, 5, 6],
                "z": [7, 8, 9],
            }
            row_ids = pd.Index(["a", "b", "c"], name="id")
            frame = pd.DataFrame(records, index=row_ids)
            return dd.from_pandas(frame, npartitions=1)

    # No `produces` argument is passed here; that is the case under test.
    Dataset.create(
        ref=CreateData,
        dataset_name="dummy-dataset",
    )

    expected_warning = (
        "Can not infer produces. The component will not produce any new columns"
    )
    assert expected_warning in caplog.text

0 comments on commit 92ef1fe

Please sign in to comment.