Skip to content

Commit

Permalink
refactor(breadbox): Remove catalog node aliases (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
snwessel authored Jul 23, 2024
1 parent c7f9496 commit 1418f65
Show file tree
Hide file tree
Showing 9 changed files with 36 additions and 170 deletions.
8 changes: 4 additions & 4 deletions breadbox/breadbox/compute/analysis_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,8 +440,8 @@ def create_cell_line_group(
depmap_model_sample_type = get_dimension_type(db, "depmap_model")

(
feature_labels_and_aliases,
sample_labels_and_aliases,
feature_given_id_and_index_df,
sample_given_id_and_index_df,
feature_warnings,
sample_warnings,
) = dataclasses.astuple(
Expand Down Expand Up @@ -481,8 +481,8 @@ def create_cell_line_group(
db,
user,
dataset_in,
feature_labels_and_aliases,
sample_labels_and_aliases,
feature_given_id_and_index_df,
sample_given_id_and_index_df,
generic_feature_type,
depmap_model_sample_type,
)
Expand Down
44 changes: 4 additions & 40 deletions breadbox/breadbox/compute/dataset_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,26 +60,6 @@ def get_file_dict(datafile) -> FileDict:
return data_file_dict


def _get_transient_dataset_feature_cat_node_id(db, user, added_dataset) -> int:
feature_cat_nodes = (
db.query(CatalogNode)
.join(DatasetFeature, DatasetFeature.id == CatalogNode.dimension_id)
.filter(
and_(
CatalogNode.dataset_id == added_dataset.id,
CatalogNode.dimension_id.isnot(None),
)
)
.all()
)
if len(feature_cat_nodes) != 1:
raise HTTPException(
400,
f"Unexpected number of features for a transient dataset. Expected 1, found {len(feature_cat_nodes)}.",
)
return feature_cat_nodes[0].id


# Convert the FileDict back to an UploadFile after it is passed through redis.
# This is a general function used for data_file
def _get_upload_file_from_file_dict(file_dict: FileDict) -> UploadFile:
Expand Down Expand Up @@ -238,8 +218,8 @@ def upload_dataset(

try:
(
feature_labels_and_aliases,
sample_labels_and_aliases,
feature_given_id_and_index_df,
sample_given_id_and_index_df,
feature_warnings,
sample_warnings,
) = dataclasses.astuple(
Expand Down Expand Up @@ -280,8 +260,8 @@ def upload_dataset(
db,
user,
dataset,
feature_labels_and_aliases,
sample_labels_and_aliases,
feature_given_id_and_index_df,
sample_given_id_and_index_df,
valid_fields.valid_feature_type,
valid_fields.valid_sample_type,
)
Expand Down Expand Up @@ -310,21 +290,6 @@ def upload_dataset(
warning = feature_warning + sample_warning

forwardingUrl = None
slice_id = ""
if is_transient:
feature_cat_node_id = _get_transient_dataset_feature_cat_node_id(
db, user, added_dataset
)
slice_id = str(feature_cat_node_id)

from urllib.parse import urlencode

params = {
"x": feature_cat_node_id,
"y": feature_cat_node_id,
"defaultCustomAnalysisToX": True,
}
forwardingUrl = "/elara?" + urlencode(params)

if update_message:
update_message("Wrapping up upload...")
Expand All @@ -336,7 +301,6 @@ def upload_dataset(
datasetId=str(added_dataset.id),
warnings=[warning],
forwardingUrl=forwardingUrl,
sliceId=slice_id,
)


Expand Down
52 changes: 4 additions & 48 deletions breadbox/breadbox/crud/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ def add_matrix_dataset(
db: SessionWithUser,
user: str,
dataset_in: MatrixDatasetIn,
feature_labels_and_aliases: pd.DataFrame,
sample_labels_and_aliases: pd.DataFrame,
feature_given_id_and_index_df: pd.DataFrame,
sample_given_id_and_index_df: pd.DataFrame,
feature_type: Optional[DimensionType],
sample_type: DimensionType,
):
Expand Down Expand Up @@ -243,18 +243,16 @@ def is_binary_category(allowed_values_list):

_add_matrix_dataset_dimensions(
db=db,
index_and_given_id=feature_labels_and_aliases,
index_and_given_id=feature_given_id_and_index_df,
dimension_subtype_cls=DatasetFeature,
dimension_type_name=feature_type.name if feature_type else None,
dataset_catalog_node=dataset_catalog_node,
dataset=dataset,
)
_add_matrix_dataset_dimensions(
db=db,
index_and_given_id=sample_labels_and_aliases,
index_and_given_id=sample_given_id_and_index_df,
dimension_subtype_cls=DatasetSample,
dimension_type_name=sample_type.name,
dataset_catalog_node=dataset_catalog_node,
dataset=dataset,
)

Expand Down Expand Up @@ -656,7 +654,6 @@ def _add_matrix_dataset_dimensions(
index_and_given_id: pd.DataFrame,
dimension_subtype_cls: Union[Type[DatasetFeature], Type[DatasetSample]],
dimension_type_name: Optional[str],
dataset_catalog_node: CatalogNode,
dataset: MatrixDataset,
):
dimensions: List[Union[DatasetFeature, DatasetSample]] = []
Expand All @@ -681,19 +678,6 @@ def _add_matrix_dataset_dimensions(
)
)

# `given_id` is being passed in here for the label of the catalog node. This is technically not how
# catalog nodes were originally intended to work, but:
# 1. we should be eliminating catalog nodes in the future
# 2. catalog nodes no longer have any UI impact
# 3. removing catalog nodes at this time would require a decent amount of refactoring
# This allows us to keep catalog nodes until we're ready to remove them, while at the same time
# avoiding catalog nodes depending on labels. (Metadata can now change after datasets are loaded,
# so any labels run the risk of being out of date)
dimension_catalog_nodes = _create_dataset_dimension_catalog_nodes(
row, dimension_id, given_id, dataset_catalog_node,
)
catalog_nodes.extend(dimension_catalog_nodes)

db.bulk_save_objects(dimensions)
db.flush()
add_catalog_nodes(db, catalog_nodes)
Expand Down Expand Up @@ -1412,34 +1396,6 @@ def delete_dataset(
return True


def _create_dataset_dimension_catalog_nodes(
dimension_row: pd.Series,
dimension_id: str,
dimension_label: str,
dataset_catalog_node: CatalogNode,
) -> List[CatalogNode]:
catalog_nodes: List[CatalogNode] = []

aliases_set = set()
aliases_set.add(dimension_label)

for alias in aliases_set:
catalog_nodes.append(
CatalogNode(
dataset_id=dataset_catalog_node.dataset_id,
dimension_id=dimension_id,
priority=0,
parent=dataset_catalog_node,
label=alias,
is_continuous=dataset_catalog_node.is_continuous,
is_categorical=dataset_catalog_node.is_categorical,
is_binary=dataset_catalog_node.is_binary,
is_text=dataset_catalog_node.is_text,
)
)
return catalog_nodes


def add_catalog_nodes(db: SessionWithUser, catalog_nodes: List[CatalogNode]):
for i in range(0, len(catalog_nodes), 10000): # arbitrary chunk size
chunk = i + 10000
Expand Down
15 changes: 0 additions & 15 deletions breadbox/breadbox/crud/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from breadbox.crud.dataset import (
ROOT_ID,
add_catalog_nodes,
_create_dataset_dimension_catalog_nodes,
delete_dataset,
get_properties_to_index,
populate_search_index,
Expand Down Expand Up @@ -367,16 +366,6 @@ def get_dataset_dimension_axis(
dataset_dimensions_with_dimension_type_query.session.connection(),
).groupby("dataset_id")
for dataset_key in dims_grouped_by_dataset_df.groups.keys():
dataset_catalog_node = (
db.query(CatalogNode)
.filter(
and_(
CatalogNode.dimension_id.is_(None),
CatalogNode.dataset_id == dataset_key,
)
)
.one()
)
dataset_dims_df = dims_grouped_by_dataset_df.get_group(dataset_key)
# Rename dimension 'id' column in case of collision in naming where dimension type identifier is also named 'id'
dataset_dims_df.rename(columns={"id": "dimension_id"}, inplace=True)
Expand All @@ -387,10 +376,6 @@ def get_dataset_dimension_axis(
dimension_label = (
str(row["label"]) if row["label"] is not None else row["given_id"]
)
new_dim_nodes = _create_dataset_dimension_catalog_nodes(
row, row["dimension_id"], dimension_label, dataset_catalog_node
)
new_dataset_dimension_catalog_nodes.extend(new_dim_nodes)
if dimension_type.axis == "feature":
if (
dimension_label != row["feature_label"]
Expand Down
23 changes: 4 additions & 19 deletions breadbox/breadbox/io/data_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ def _get_dimension_type_given_ids(
db: SessionWithUser, dimension_type: DimensionType
) -> list[str]:
"""
Get entity labels and aliases from the metadata for the given dimension type.
Get the full list of dimension type given IDs by reading the ID column
of the dimension type metadata table.
"""
given_ids = (
db.query(TabularCell)
Expand All @@ -258,22 +259,6 @@ def _get_dimension_type_given_ids(
return [x for x, in given_ids]


def _get_dataset_dimensions_labels(
dataset_dimensions: Union[pd.Series, pd.Index],
dimension_type_entity_labels_df: pd.DataFrame,
) -> pd.DataFrame:
"""
For dataset matrix features or samples identifiers, get their canonical names ('label') and aliases (if any) and return it as a dataframe indexed by their ids
"""
dataset_dimensions_df: pd.DataFrame = dataset_dimensions.to_frame(name="given_id")
# NOTE: Merging at this stage means changes to dimension's label might not be reflected when stored in DatsetFeature or DatasetSample
dataset_dimensions_df = dataset_dimensions_df.merge(
dimension_type_entity_labels_df, "left", on="given_id"
)
dataset_dimensions_df.replace({np.nan: None}, inplace=True)
return dataset_dimensions_df


@dataclass
class DimensionLabelsAndWarnings:
given_id_to_index: pd.DataFrame
Expand All @@ -282,8 +267,8 @@ class DimensionLabelsAndWarnings:

@dataclass
class DataframeValidatedFile:
feature_labels_and_aliases: pd.DataFrame
sample_labels_and_aliases: pd.DataFrame
feature_given_id_and_index: pd.DataFrame
sample_given_id_and_index: pd.DataFrame
feature_warnings: List[str]
sample_warnings: List[str]

Expand Down
1 change: 0 additions & 1 deletion breadbox/breadbox/schemas/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,6 @@ class UploadDatasetResponse(BaseModel):
dataset: DatasetResponse
warnings: List[str]
forwardingUrl: Optional[str] = Field()
sliceId: Optional[str] = Field()


class UploadDatasetResponseV2(BaseModel):
Expand Down
2 changes: 0 additions & 2 deletions breadbox/pyright-ratchet-errors.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ types.py: error: Argument of type "Dict[str, AnnotationType] | None" cannot be a
types.py: error: Argument of type "Dict[str, AnnotationType] | None" cannot be assigned to parameter "annotation_type_mapping" of type "Dict[str, AnnotationType]" in function "validate_dimension_type_metadata"
types.py: error: Argument of type "Dict[str, Any] | None" cannot be assigned to parameter "reference_column_mappings" of type "Dict[str, str]" in function "check_id_mapping_is_valid"
types.py: error: Argument of type "Json[IdMappingInsanity] | None" cannot be assigned to parameter "id_mapping" of type "Dict[str, Any] | None" in function "__init__" (reportArgumentType)
types.py: error: Argument of type "Series | Unknown | ndarray[Any, Unknown] | Any | NDArray[Unknown]" cannot be assigned to parameter "dimension_id" of type "str" in function "_create_dataset_dimension_catalog_nodes"
types.py: error: Argument of type "Text" cannot be assigned to parameter "axis" of type "Literal['feature', 'sample']" in function "__init__"
types.py: error: Argument of type "str | None" cannot be assigned to parameter "file_name" of type "str" in function "update_dimension_type_metadata"
types.py: error: Argument of type "str | Series | Unknown | ndarray[Any, Unknown] | Any | NDArray[Unknown]" cannot be assigned to parameter "dimension_label" of type "str" in function "_create_dataset_dimension_catalog_nodes"
types.py: error: Cannot assign to attribute "dataset_id" for class "DimensionType"
types.py: error: Expression of type "IdMapping" is incompatible with declared type "Json[IdMappingInsanity] | None"
types.py: error: Expression of type "None" cannot be assigned to parameter of type "Dict[str, AnnotationType]"
Expand Down
53 changes: 20 additions & 33 deletions breadbox/tests/api/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@

from breadbox.db.session import SessionWithUser
from breadbox.models.dataset import (
AnnotationType,
CatalogNode,
DatasetFeature,
DatasetSample,
Dataset,
Dimension,
MatrixDataset,
TabularDataset,
TabularColumn,
TabularCell,
AnnotationType,
ValueType,
)
from fastapi.testclient import TestClient
from breadbox.api.dependencies import get_dataset
Expand Down Expand Up @@ -1349,9 +1352,6 @@ def test_add_dataset(
catalog_nodes = minimal_db.query(CatalogNode).all()
assert len(feature_indexes) == 3 # Number of feaures should be 3
assert len(sample_indexes) == 2 # Number of feaures should be 2
assert (
len(catalog_nodes) == 7
) # Number of nodes is features + samples + dataset + root

def test_add_dataset_no_write_access(
self,
Expand Down Expand Up @@ -1450,22 +1450,15 @@ def test_add_categorical_dataset(
result_dataset = r.json()["result"]["dataset"]
feature_indexes = minimal_db.query(DatasetFeature).all()
sample_indexes = minimal_db.query(DatasetSample).all()
catalog_nodes = minimal_db.query(CatalogNode).all()
dataset_node = (
minimal_db.query(CatalogNode)
.filter(
and_(
CatalogNode.dataset_id == result_dataset["id"],
CatalogNode.dimension_id.is_(None),
)
)
.one()
)
assert len(feature_indexes) == 3 # Number of feaures should be 3
assert len(sample_indexes) == 2 # Number of samples should be 2
assert len(catalog_nodes) == 7 # Number features + samples + root + dataset
assert dataset_node
assert dataset_node.is_categorical == True # and dataset_node.is_binary == True
categorical_dataset = (
minimal_db.query(MatrixDataset)
.filter(MatrixDataset.id == result_dataset["id"])
.one()
)
assert categorical_dataset
assert categorical_dataset.value_type == ValueType.categorical

def test_add_categorical_and_binary_dataset(
self, client: TestClient, minimal_db, private_group: Dict, mock_celery
Expand Down Expand Up @@ -1782,12 +1775,12 @@ def test_add_dataset_with_missing_values(
assert minimal_db.query(Dataset).filter_by(id=result_dataset["id"]).one()
assert (
len(
minimal_db.query(CatalogNode)
minimal_db.query(Dimension)
.filter_by(dataset_id=result_dataset["id"])
.all()
)
== 6
) # 3 features + 2 samples + 1 dataset node
== 5
) # 3 features + 2 samples

r_categorical = client.post(
"/datasets/?allowed_values=Thing1&allowed_values=Thing2&allowed_values=Thing3",
Expand Down Expand Up @@ -1820,23 +1813,17 @@ def test_add_dataset_with_missing_values(
assert minimal_db.query(Dataset).filter_by(id=result_dataset["id"]).one()
assert (
len(
minimal_db.query(CatalogNode)
minimal_db.query(Dimension)
.filter_by(dataset_id=result_dataset["id"])
.all()
)
== 6
) # 3 features + 2 samples + 1 dataset node
categorical_dataset_node = (
minimal_db.query(CatalogNode)
.filter(
and_(
CatalogNode.dataset_id == result_dataset["id"],
CatalogNode.dimension_id.is_(None),
)
)
== 5
) # 3 features + 2 samples
categorical_dataset = (
minimal_db.query(MatrixDataset)
.filter(MatrixDataset.value_type == ValueType.categorical)
.one()
)
categorical_dataset = categorical_dataset_node.dataset
feature_indices = [
tup[0]
for tup in minimal_db.query(DatasetFeature.index)
Expand Down
Loading

0 comments on commit 1418f65

Please sign in to comment.