Migrate to KfP V2 (#477)
Co-authored-by: Philippe Moussalli <[email protected]>
Co-authored-by: Robbe Sneyders <[email protected]>
3 people authored Oct 11, 2023
1 parent 8faccb4 commit 6b46324
Showing 53 changed files with 2,440 additions and 1,256 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pipeline.yaml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.8', '3.9', '3.10']
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
16 changes: 8 additions & 8 deletions .pre-commit-config.yaml
@@ -56,11 +56,11 @@ repos:
           - types-requests
         pass_filenames: false

-  - repo: local
-    hooks:
-      - id: generate_component_readmes
-        name: Generate component READMEs
-        language: python
-        entry: python scripts/component_readme/generate_readme.py
-        files: ^components/.*/fondant_component.yaml
-        additional_dependencies: ["fondant"]
+#  - repo: local
+#    hooks:
+#      - id: generate_component_readmes
+#        name: Generate component READMEs
+#        language: python
+#        entry: python scripts/component_readme/generate_readme.py
+#        files: ^components/.*/fondant_component.yaml
+#        additional_dependencies: ["fondant"]
4 changes: 2 additions & 2 deletions components/download_images/fondant_component.yaml
@@ -54,12 +54,12 @@ args:
   resize_only_if_bigger:
     description: If True, resize only if image is bigger than image_size.
     type: bool
-    default: 'False'
+    default: False
   min_image_size:
     description: Minimum size of the images.
     type: int
     default: 0
   max_aspect_ratio:
     description: Maximum aspect ratio of the images.
     type: float
-    default: 'inf'
+    default: 99.9
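
A note on the two changed defaults above: quoted YAML values are strings, so `'False'` would reach the component as a non-empty (and therefore truthy) string, while `'inf'` would arrive as text rather than a float; the commit switches both to native literals, with `99.9` serving as a large finite stand-in for infinity. A quick illustration in plain Python (not Fondant code):

```python
# Illustration only: why the quoted boolean default was error-prone.
assert bool("False") is True   # quoted YAML default: a truthy string
assert bool(False) is False    # native boolean: behaves as intended
```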
4 changes: 2 additions & 2 deletions components/load_from_hf_hub/fondant_component.yaml
@@ -19,11 +19,11 @@ args:
     description: Optional argument, a list containing the original image column names in case the
       dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
11 changes: 5 additions & 6 deletions components/load_from_hf_hub/src/main.py
@@ -1,6 +1,5 @@
 """This component loads a seed dataset from the hub."""
 import logging
-import typing as t

 import dask
 import dask.dataframe as dd
@@ -20,9 +19,9 @@ def __init__(self,
         *_,
         dataset_name: str,
         column_name_mapping: dict,
-        image_column_names: t.Optional[list],
-        n_rows_to_load: t.Optional[int],
-        index_column: t.Optional[str],
+        image_column_names: list,
+        n_rows_to_load: int,
+        index_column: str,
     ) -> None:
         """
         Args:
@@ -60,7 +59,7 @@ def load(self) -> dd.DataFrame:
         dask_df = dask_df.rename(columns=self.column_name_mapping)

         # 4) Optional: only return specific amount of rows
-        if self.n_rows_to_load is not None:
+        if self.n_rows_to_load > 0:
             partitions_length = 0
             npartitions = 1
             for npartitions, partition in enumerate(dask_df.partitions, start=1):
@@ -73,7 +72,7 @@
             dask_df = dd.from_pandas(dask_df, npartitions=npartitions)

         # 4) Set the index
-        if self.index_column is None:
+        if self.index_column == "None":
             logger.info(
                 "Index column not specified, setting a globally unique index",
             )
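
The pattern in this file recurs throughout the commit: because argument defaults can no longer be `None` after the KFP v2 migration, "not provided" is now encoded with typed sentinels (`-1` for ints, `[]`/`{}` for iterables, and, as the `index_column` check above suggests, the literal string `"None"` for strings), and the code tests for the sentinel instead of using `is None`. A minimal runnable sketch of the row-truncation idiom, assuming `-1` means "load everything" (`head_n_rows` is an invented name for illustration):

```python
import dask.dataframe as dd
import pandas as pd


def head_n_rows(dask_df: dd.DataFrame, n_rows_to_load: int = -1) -> dd.DataFrame:
    """Return at most n_rows_to_load rows; the sentinel -1 disables truncation."""
    if n_rows_to_load > 0:  # replaces the old `is not None` check
        rows_seen = 0
        npartitions = 1
        # Walk partitions until enough rows are covered, mirroring the loop above.
        for npartitions, partition in enumerate(dask_df.partitions, start=1):
            rows_seen += len(partition)
            if rows_seen >= n_rows_to_load:
                break
        truncated = dask_df.head(n_rows_to_load, npartitions=npartitions)
        dask_df = dd.from_pandas(truncated, npartitions=npartitions)
    return dask_df


df = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
assert len(head_n_rows(df, 3)) == 3
assert len(head_n_rows(df)) == 10  # sentinel default: no truncation
```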
4 changes: 2 additions & 2 deletions components/load_from_parquet/fondant_component.yaml
@@ -15,11 +15,11 @@ args:
   column_name_mapping:
     description: Mapping of the consumed dataset
     type: dict
-    default: None
+    default: {}
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str
8 changes: 4 additions & 4 deletions components/load_from_parquet/src/main.py
@@ -19,8 +19,8 @@ def __init__(self,
         spec: ComponentSpec,
         *_,
         dataset_uri: str,
-        column_name_mapping: t.Optional[dict],
-        n_rows_to_load: t.Optional[int],
+        column_name_mapping: dict,
+        n_rows_to_load: int,
         index_column: t.Optional[str],
     ) -> None:
         """
@@ -45,12 +45,12 @@ def load(self) -> dd.DataFrame:
         dask_df = dd.read_parquet(self.dataset_uri)

         # 2) Rename columns
-        if self.column_name_mapping is not None:
+        if self.column_name_mapping:
             logger.info("Renaming columns...")
             dask_df = dask_df.rename(columns=self.column_name_mapping)

         # 3) Optional: only return specific amount of rows
-        if self.n_rows_to_load is not None:
+        if self.n_rows_to_load > 0:
             partitions_length = 0
             npartitions = 1
             for npartitions, partition in enumerate(dask_df.partitions, start=1):
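
For iterable arguments the sentinel is the empty container, which is falsy in Python, so the plain truthiness check `if self.column_name_mapping:` above replaces `is not None`. A tiny sketch of the same idiom (names are illustrative):

```python
import pandas as pd


def rename_columns(df: pd.DataFrame, column_name_mapping: dict) -> pd.DataFrame:
    """Rename columns unless the sentinel default {} was passed."""
    if column_name_mapping:  # {} is falsy, so the rename is skipped by default
        df = df.rename(columns=column_name_mapping)
    return df


df = pd.DataFrame({"old": [1, 2]})
assert list(rename_columns(df, {"old": "new"}).columns) == ["new"]
assert list(rename_columns(df, {}).columns) == ["old"]
```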
2 changes: 1 addition & 1 deletion components/segment_images/fondant_component.yaml
@@ -22,4 +22,4 @@ args:
   batch_size:
     description: batch size to use
     type: int
-    batch_size: 8
+    default: 8
4 changes: 2 additions & 2 deletions components/write_to_hf_hub/fondant_component.yaml
@@ -21,8 +21,8 @@ args:
   image_column_names:
     description: A list containing the image column names. Used to format to image to HF hub format
     type: list
-    default: None
+    default: []
   column_name_mapping:
     description: Mapping of the consumed fondant column names to the written hub column names
     type: dict
-    default: None
+    default: {}
6 changes: 3 additions & 3 deletions components/write_to_hf_hub/src/main.py
@@ -39,8 +39,8 @@ def __init__(self,
         hf_token: str,
         username: str,
         dataset_name: str,
-        image_column_names: t.Optional[list],
-        column_name_mapping: t.Optional[dict],
+        image_column_names: list,
+        column_name_mapping: dict,
     ):
         """
         Args:
@@ -87,7 +87,7 @@ def write(
         # Map image column to hf data format
         feature_encoder = datasets.Image(decode=True)

-        if self.image_column_names is not None:
+        if self.image_column_names:
             for image_column_name in self.image_column_names:
                 dataframe[image_column_name] = dataframe[image_column_name].map(
                     lambda x: convert_bytes_to_image(x, feature_encoder),
3 changes: 0 additions & 3 deletions docs/components/component_spec.md
@@ -127,9 +127,6 @@ The `args` section describes which arguments the component takes. Each argument
 `description` and a `type`, which should be one of the builtin Python types. Additionally, you can
 set an optional `default` value for each argument.

-_Note:_ default iterable arguments such as `dict` and `list` have to be passed as a string
-(e.g. `'{"foo":1, "bar":2}`, `'["foo","bar]'`)
-
 ```yaml
 args:
   custom_argument:
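
The removed note is obsolete after this migration: iterable defaults and arguments no longer need to be JSON-encoded strings. A hedged sketch of the pipeline-side effect, assuming the `ComponentOp` registry API used elsewhere in this commit (the argument values are illustrative):

```python
from fondant.pipeline import ComponentOp

# dict and list arguments are passed as real Python objects,
# not as '{"foo": 1}' / '["foo"]' strings.
load_op = ComponentOp.from_registry(
    name="load_from_hf_hub",
    arguments={
        "dataset_name": "user/dataset",                   # illustrative value
        "column_name_mapping": {"image": "images_data"},  # dict, not a string
        "image_column_names": ["image"],                  # list, not a string
    },
)
```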
3 changes: 2 additions & 1 deletion docs/pipeline.md
@@ -30,7 +30,8 @@ def build_pipeline():
             "batch_size": 2,
             "max_new_tokens": 50,
         },
-        number_of_gpus=1,
+        number_of_accelerators=1,
+        accelerator_name="GPU",
         node_pool_label="node_pool",
         node_pool_name="model-inference-pool",
     )
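
The resource rename is mechanical across the commit: `number_of_gpus=n` becomes `number_of_accelerators=n` plus an explicit `accelerator_name`. A hedged, consolidated example (the component name and argument values are illustrative, not taken from this file):

```python
from fondant.pipeline import ComponentOp

caption_op = ComponentOp.from_registry(
    name="caption_images",
    arguments={"batch_size": 2, "max_new_tokens": 50},
    # before: number_of_gpus=1
    number_of_accelerators=1,   # how many devices to request
    accelerator_name="GPU",     # which kind of device
    node_pool_label="node_pool",
    node_pool_name="model-inference-pool",
)
```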
@@ -26,4 +26,4 @@
   extract_plain_text:
     description: If set to true the data contains the plain text without html tags
     type: bool
-    default: "False"
+    default: False
@@ -15,4 +15,4 @@
   n_records_to_download:
     description: Number of records to download
     type: int
-    default: None
+    default: -1
@@ -18,7 +18,7 @@ def __init__(
         self,
         *_,
         common_crawl_indices: t.List[str],
-        n_records_to_download: t.Optional[int] = None,
+        n_records_to_download: int,
     ):
         self.index_urls = [
             self.build_index_url(index_name) for index_name in common_crawl_indices
@@ -38,7 +38,7 @@ def load(self) -> dd.DataFrame:
             warc_urls.extend([line.decode() for line in extracted.split(b"\n")])

         df = pd.DataFrame(warc_urls, columns=["warc_url"])
-        if self.n_records_to_download is not None:
+        if self.n_records_to_download > 0:
             df = df.head(self.n_records_to_download)

         return dd.from_pandas(df, npartitions=len(df) // 100)
@@ -12,4 +12,4 @@
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
@@ -114,7 +114,7 @@ def load(self) -> dd.DataFrame:

         pandas_df = pd.DataFrame(prompts, columns=["prompts_text"])

-        if self.n_rows_to_load:
+        if self.n_rows_to_load > 0:
             pandas_df = pandas_df.head(self.n_rows_to_load)

         df = dd.from_pandas(pandas_df, npartitions=1)
@@ -31,8 +31,8 @@ args:
   image_column_names:
     description: A list containing the image column names. Used to format to image to HF hub format
     type: list
-    default: None
+    default: []
   column_name_mapping:
     description: Mapping of the consumed fondant column names to the written hub column names
     type: dict
-    default: None
+    default: {}
6 changes: 4 additions & 2 deletions examples/pipelines/controlnet-interior-design/pipeline.py
@@ -45,15 +45,17 @@
         "batch_size": 2,
         "max_new_tokens": 50,
     },
-    number_of_gpus=1,
+    number_of_accelerators=1,
+    accelerator_name="GPU",
 )
 segment_images_op = ComponentOp.from_registry(
     name="segment_images",
     arguments={
         "model_id": "openmmlab/upernet-convnext-small",
         "batch_size": 2,
     },
-    number_of_gpus=1,
+    number_of_accelerators=1,
+    accelerator_name="GPU",
 )

 write_to_hub_controlnet = ComponentOp(
@@ -16,7 +16,6 @@ consumes:
       items:
         type: float32

-
 produces:
   imagetext:
     fields:
@@ -50,11 +50,11 @@ args:
     description: Optional argument, a list containing the original image column names in case the
       dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
     type: list
-    default: None
+    default: []
   n_rows_to_load:
     description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
     type: int
-    default: None
+    default: -1
   index_column:
     description: Column to set index to in the load component, if not specified a default globally unique index will be set
     type: str