Skip to content

Commit

Permalink
Adding uuid to dataset version (#587)
Browse files Browse the repository at this point in the history
* added uuid to dataset version

* returning all tests

* fixing pull and having same uuid locally
  • Loading branch information
ilongin authored Nov 13, 2024
1 parent 3f5e9f8 commit c5eb1d6
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/datachain/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,7 @@ def create_dataset(
create_rows: Optional[bool] = True,
validate_version: Optional[bool] = True,
listing: Optional[bool] = False,
uuid: Optional[str] = None,
) -> "DatasetRecord":
"""
Creates new dataset of a specific version.
Expand Down Expand Up @@ -816,6 +817,7 @@ def create_dataset(
query_script=query_script,
create_rows_table=create_rows,
columns=columns,
uuid=uuid,
)

def create_new_dataset_version(
Expand All @@ -832,6 +834,7 @@ def create_new_dataset_version(
script_output="",
create_rows_table=True,
job_id: Optional[str] = None,
uuid: Optional[str] = None,
) -> DatasetRecord:
"""
Creates dataset version if it doesn't exist.
Expand All @@ -855,6 +858,7 @@ def create_new_dataset_version(
schema=schema,
job_id=job_id,
ignore_if_exists=True,
uuid=uuid,
)

if create_rows_table:
Expand Down Expand Up @@ -1400,6 +1404,7 @@ def _instantiate_dataset():
columns=columns,
feature_schema=remote_dataset_version.feature_schema,
validate_version=False,
uuid=remote_dataset_version.uuid,
)

# asking remote to export dataset rows table to s3 and to return signed
Expand Down
4 changes: 4 additions & 0 deletions src/datachain/data_storage/metastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def create_dataset_version( # noqa: PLR0913
size: Optional[int] = None,
preview: Optional[list[dict]] = None,
job_id: Optional[str] = None,
uuid: Optional[str] = None,
) -> DatasetRecord:
"""Creates new dataset version."""

Expand Down Expand Up @@ -352,6 +353,7 @@ def _datasets_versions_columns(cls) -> list["SchemaItem"]:
"""Datasets versions table columns."""
return [
Column("id", Integer, primary_key=True),
Column("uuid", Text, nullable=False, default=uuid4()),
Column(
"dataset_id",
Integer,
Expand Down Expand Up @@ -545,6 +547,7 @@ def create_dataset_version( # noqa: PLR0913
size: Optional[int] = None,
preview: Optional[list[dict]] = None,
job_id: Optional[str] = None,
uuid: Optional[str] = None,
conn=None,
) -> DatasetRecord:
"""Creates new dataset version."""
Expand All @@ -555,6 +558,7 @@ def create_dataset_version( # noqa: PLR0913

query = self._datasets_versions_insert().values(
dataset_id=dataset.id,
uuid=uuid or str(uuid4()),
version=version,
status=status,
feature_schema=json.dumps(feature_schema or {}),
Expand Down
5 changes: 5 additions & 0 deletions src/datachain/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ class DatasetStatus:
@dataclass
class DatasetVersion:
id: int
uuid: str
dataset_id: int
version: int
status: int
Expand All @@ -184,6 +185,7 @@ class DatasetVersion:
def parse( # noqa: PLR0913
cls: type[V],
id: int,
uuid: str,
dataset_id: int,
version: int,
status: int,
Expand All @@ -203,6 +205,7 @@ def parse( # noqa: PLR0913
):
return cls(
id,
uuid,
dataset_id,
version,
status,
Expand Down Expand Up @@ -306,6 +309,7 @@ def parse( # noqa: PLR0913
query_script: str,
schema: str,
version_id: int,
version_uuid: str,
version_dataset_id: int,
version: int,
version_status: int,
Expand All @@ -331,6 +335,7 @@ def parse( # noqa: PLR0913

dataset_version = DatasetVersion.parse(
version_id,
version_uuid,
version_dataset_id,
version,
version_status,
Expand Down
3 changes: 3 additions & 0 deletions src/datachain/lib/dataset_info.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
from datetime import datetime
from typing import TYPE_CHECKING, Any, Optional, Union
from uuid import uuid4

from pydantic import Field, field_validator

Expand All @@ -15,6 +16,7 @@

class DatasetInfo(DataModel):
name: str
uuid: str = Field(default=str(uuid4()))
version: int = Field(default=1)
status: int = Field(default=DatasetStatus.CREATED)
created_at: datetime = Field(default=TIME_ZERO)
Expand Down Expand Up @@ -60,6 +62,7 @@ def from_models(
job: Optional[Job],
) -> "Self":
return cls(
uuid=version.uuid,
name=dataset.name,
version=version.version,
status=version.status,
Expand Down
4 changes: 4 additions & 0 deletions tests/func/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def test_create_dataset_no_version_specified(cloud_test_catalog, create_rows):
assert dataset.schema["similarity"] == Float32
assert dataset_version.schema["similarity"] == Float32
assert dataset_version.status == DatasetStatus.PENDING
assert dataset_version.uuid
assert dataset.status == DatasetStatus.CREATED # dataset status is deprecated
if create_rows:
assert dataset_version.num_objects == 0
Expand Down Expand Up @@ -85,6 +86,7 @@ def test_create_dataset_with_explicit_version(cloud_test_catalog, create_rows):
assert dataset.schema["similarity"] == Float32
assert dataset_version.schema["similarity"] == Float32
assert dataset_version.status == DatasetStatus.PENDING
assert dataset_version.uuid
assert dataset.status == DatasetStatus.CREATED
if create_rows:
assert dataset_version.num_objects == 0
Expand Down Expand Up @@ -178,6 +180,7 @@ def test_create_dataset_from_sources(listed_bucket, cloud_test_catalog):
assert dataset_version.error_stack == ""
assert dataset_version.script_output == ""
assert dataset_version.sources == f"{src_uri}/dogs/*"
assert dataset_version.uuid

dr = catalog.warehouse.schema.dataset_row_cls
sys_schema = {c.name: type(c.type) for c in dr.sys_columns()}
Expand Down Expand Up @@ -214,6 +217,7 @@ def test_create_dataset_from_sources_dataset(cloud_test_catalog, dogs_dataset):
assert dataset_version.error_stack == ""
assert dataset_version.script_output == ""
assert dataset_version.sources == f"ds://{dogs_dataset.name}"
assert dataset_version.uuid

dr = catalog.warehouse.schema.dataset_row_cls
sys_schema = {c.name: type(c.type) for c in dr.sys_columns()}
Expand Down
4 changes: 4 additions & 0 deletions tests/func/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from tests.data import ENTRIES
from tests.utils import assert_row_names, skip_if_not_sqlite

DATASET_UUID = "20f5a2f1-fc9a-4e36-8b91-5a530f289451"


@pytest.fixture(autouse=True)
def studio_config():
Expand Down Expand Up @@ -90,6 +92,7 @@ def schema():
def remote_dataset_version(schema, dataset_rows):
return {
"id": 1,
"uuid": DATASET_UUID,
"dataset_id": 1,
"version": 1,
"status": 4,
Expand Down Expand Up @@ -179,6 +182,7 @@ def test_pull_dataset_success(
assert dataset_version.schema
assert dataset_version.num_objects == 4
assert dataset_version.size == 15
assert dataset_version.uuid == DATASET_UUID

assert_row_names(
catalog,
Expand Down

0 comments on commit c5eb1d6

Please sign in to comment.