Implementation of the new dataset interface (#902)
First steps for the implementation of the new dataset interface:
- Removed the `Pipeline` class
- Added a `Workspace` singleton to hold the pipeline name, base_path, etc.
  (shouldn't be the focus of this PR)
- Moved `Pipeline.read(..)` to the `Dataset` class (see the usage sketch below)
mrchtr authored Mar 21, 2024
1 parent ea61a82 commit f9238e2
Showing 24 changed files with 652 additions and 468 deletions.
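
A minimal before/after sketch of the interface change, assembled from the diffs below (all names are taken from `examples/sample_pipeline/pipeline.py`; this is illustrative, not the full example):

```python
import pyarrow as pa

from fondant.dataset import Dataset, Workspace

# Before: pipeline = Pipeline(name="dummy-pipeline", base_path="./.artifacts")
#         dataset = pipeline.read("load_from_parquet", ...)
# After: a Workspace singleton carries the name and base path,
# and read() lives on Dataset, taking the workspace explicitly.
workspace = Workspace(name="dummy-pipeline", base_path="./.artifacts")

dataset = Dataset.read(
    "load_from_parquet",
    arguments={"dataset_uri": "/data/sample.parquet"},
    produces={"text_data": pa.string()},
    workspace=workspace,
)
```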
12 changes: 6 additions & 6 deletions docs/components/component_spec.md
@@ -102,7 +102,7 @@ If your dataset has a field called `custom_text` with type `string`, you can map

```python
-dataset = pipeline.read(...)
+dataset = Dataset.read(...)
dataset = dataset.apply(
    "example_component",
    consumes={
@@ -127,7 +127,7 @@ so as follows:

```python
-dataset = pipeline.read(...)
+dataset = Dataset.read(...)
dataset = dataset.apply(
    "example_component",
    produces={
@@ -165,7 +165,7 @@ in the component specification, so we will need to specify the schema of the
fields when defining the components

```python
-dataset = pipeline.read(
+dataset = Dataset.read(
    "load_from_csv",
    arguments={
        "dataset_uri": "path/to/dataset.csv",
@@ -196,7 +196,7 @@ by the next component. We can either load the `image` field:

```python
-dataset = pipeline.read(
+dataset = Dataset.read(
    "load_from_csv",
    arguments={
        "dataset_uri": "path/to/dataset.csv",
@@ -219,7 +219,7 @@ or the `embedding` field:

```python
-dataset = pipeline.read(
+dataset = Dataset.read(
    "load_from_csv",
    arguments={
        "dataset_uri": "path/to/dataset.csv",
@@ -268,7 +268,7 @@ These arguments are passed in when the component is instantiated.
If an argument is not explicitly provided, the default value will be used instead if available.

```python
-dataset = pipeline.read(
+dataset = Dataset.read(
    "custom_component",
    arguments={
        "custom_argument": "foo"
2 changes: 1 addition & 1 deletion docs/components/lightweight_components.md
@@ -53,7 +53,7 @@ pipeline = Pipeline(
base_path="./data",
)

dataset = pipeline.read(
dataset = Dataset.read(
ref=CreateData,
)

10 changes: 5 additions & 5 deletions docs/guides/build_a_simple_pipeline.md
@@ -44,7 +44,7 @@ pipeline = Pipeline(

??? "View a detailed reference of the options accepted by the `Pipeline` class"

-    ::: fondant.dataset.Pipeline.__init__
+    ::: fondant.dataset.Dataset.__init__
        handler: python
        options:
          show_source: false
@@ -69,13 +69,13 @@ As a first step, we want to read data into our pipeline. In this case, we will l
from the HuggingFace Hub. For this, we can use the reusable
[load_from_hf_hub](../components/hub.md#load_from_hugging_face_hub#description) component.

-We can read data into our pipeline using the `Pipeline.read()` method, which returns a (lazy)
+We can read data into our pipeline using the `Dataset.read()` method, which returns a (lazy)
`Dataset`.

```python
import pyarrow as pa

-dataset = pipeline.read(
+dataset = Dataset.read(
    "load_from_hf_hub",
    arguments={
        "dataset_name": "fondant-ai/fondant-cc-25m",
@@ -101,9 +101,9 @@ We provide three arguments to the `.read()` method:
defined in the component [documentation](../components/hub.md#load_from_hugging_face_hub#inputs_outputs) with
`additionalProperties: true` under the produces section.

??? "View a detailed reference of the `Pipeline.read()` method"
??? "View a detailed reference of the `Dataset.read()` method"

-    ::: fondant.dataset.Pipeline.read
+    ::: fondant.dataset.Dataset.read
        handler: python
        options:
          show_source: false
2 changes: 1 addition & 1 deletion docs/guides/implement_custom_components.md
@@ -33,7 +33,7 @@ pipeline = Pipeline(
base_path="./data"
)

dataset = pipeline.read(
dataset = Dataset.read(
"load_from_hf_hub",
arguments={
"dataset_name": "fondant-ai/fondant-cc-25m",
12 changes: 6 additions & 6 deletions docs/pipeline.md
@@ -10,12 +10,12 @@ components and custom components, and chain them together.

Start by creating a `pipeline.py` file and adding the following code.
```python
-from fondant.pipeline import Pipeline
+from fondant.dataset import Dataset

+#dataset = Dataset.read(
+# ..
+#)

-pipeline = Pipeline(
-    name="my-pipeline",
-    base_path="./data",
-)
```

We identify our pipeline with a name and provide a base path where the pipeline will store its
@@ -49,7 +49,7 @@ dataset = Dataset.read(

??? "View a detailed reference of the `Dataset.read()` method"

-    ::: fondant.dataset.Pipeline.read
+    ::: fondant.dataset.Dataset.read
        handler: python
        options:
          show_source: false
15 changes: 8 additions & 7 deletions examples/sample_pipeline/pipeline.py
@@ -8,30 +8,29 @@
import pyarrow as pa

from fondant.component import PandasTransformComponent
-from fondant.pipeline import Pipeline, lightweight_component
+from fondant.dataset import Workspace, lightweight_component, Dataset

BASE_PATH = Path("./.artifacts").resolve()

# Define pipeline
-pipeline = Pipeline(name="dummy-pipeline", base_path=str(BASE_PATH))
+workspace = Workspace(name="dummy-pipeline", base_path=str(BASE_PATH))

# Load from hub component
load_component_column_mapping = {
    "text": "text_data",
}

-dataset = pipeline.read(
+dataset = Dataset.read(
    "load_from_parquet",
    arguments={
        "dataset_uri": "/data/sample.parquet",
        "column_name_mapping": load_component_column_mapping,
    },
    produces={"text_data": pa.string()},
+    workspace=workspace,
)

-dataset = dataset.apply(
-    "./components/dummy_component",
-)
+dataset = dataset.apply("./components/dummy_component")

dataset = dataset.apply(
    "chunk_text",
@@ -63,5 +62,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
)

dataset.write(
ref="write_to_file", arguments={"path": "/data/export"}, consumes={"text": "text"}
ref="write_to_file",
arguments={"path": "/data/export"},
consumes={"text": "text"},
)