diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile index d4769187b..5fde2bb07 100644 --- a/data-processing-lib/spark/Makefile +++ b/data-processing-lib/spark/Makefile @@ -11,9 +11,14 @@ setup:: set-versions: .check-env $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml - sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/' \ - pyproject.toml > tt.toml - mv tt.toml pyproject.toml + if [ -e pyproject.toml ]; then \ + cat pyproject.toml | sed -e 's/"spark[default]==.*",/"spark[default]==$(SPARK_VERSION)",/' > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed -e 's/ray[default]==.*/ray[default]==$(SPARK_VERSION)/' > tt.txt; \ + mv tt.txt requirements.txt; \ + fi build:: build-dist @@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist publish-image:: .defaults.publish-image -venv:: pyproject.toml +venv:: $(MAKE) .defaults.spark-lib-src-venv pip install pytest pytest-cov diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index cb7b1ee10..7054a7b9a 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -17,7 +17,7 @@ if [ ! -d transforms ]; then echo Please run this script from the top of the repository exit 1 fi -KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser" +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser fdedup" while [ $# -ne 0 ]; do case $1 in -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..683f93210 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -14,14 +14,24 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from src.fdedup_compute_execution_params import ( + cluster_analysis_compute_execution_params, + compute_common_params, + data_cleaning_compute_execution_params, + get_duplicate_list_compute_execution_params, + signature_calc_compute_execution_params, +) from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" +task_image = os.getenv("FDEDUP_IMAGE_LOCATION", "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest") +image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -40,8 +50,18 @@ # compilation time. 
import uuid - compute_exec_params_op = dsl.component_decorator.component( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = dsl.component_decorator.component(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = dsl.component_decorator.component( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = dsl.component_decorator.component( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = dsl.component_decorator.component( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = dsl.component_decorator.component( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) print( "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " @@ -49,61 +69,95 @@ ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = comp.create_component_from_func( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = comp.create_component_from_func( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = comp.create_component_from_func( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = comp.create_component_from_func( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute signature calculation job +execute_signature_calc_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) +# execute cluster analysis job +execute_cluster_analysis_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute get duplicate list job +execute_get_duplicate_list_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute data cleaning job +execute_data_cleaning_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
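The task image and image pull secret above are now resolved from the environment, so a deployment-specific Ray image can be selected without editing the workflow source. A minimal usage sketch follows; the image tag, secret name, and KFPv2 setting are illustrative assumptions, not values from this patch.

import os
import runpy

# Hypothetical overrides; set before the module is loaded so that
# FDEDUP_IMAGE_LOCATION / FDEDUP_IMAGE_PULL_SECRET are picked up at import time.
os.environ["FDEDUP_IMAGE_LOCATION"] = "quay.io/myorg/fdedup-ray:0.2.2"
os.environ["FDEDUP_IMAGE_PULL_SECRET"] = "my-registry-secret"
os.environ["KFPv2"] = "1"  # optional: select the KFP v2 component factories shown above

# Run the workflow module as a script; this compiles the pipeline into fdedup_wf.yaml.
runpy.run_path("fdedup_wf.py", run_name="__main__")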
-TASK_NAME: str = "fdedup" +TASK_NAME: str = "fuzzydedup" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", + description="Pipeline for fuzzy dedup", ) -def fdedup( +def fuzzydedup( + # folders used # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", + data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", + data_s3_access_secret: str = "s3-south-secret", + scdata_s3_access_secret: str = "s3-south-secret", + dcdata_s3_access_secret: str = "s3-south-secret", data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, + fdedup_contents_column: str = "contents", + fdedup_document_id_column: str = "int_id_column", # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, + fdedup_num_permutations: int = 112, + fdedup_num_bands: int = 14, + fdedup_num_minhashes_per_band: int = 8, + fdedup_word_shingle_size: int = 5, + fdedup_shingle_option: str = "word", + fdedup_jaccard_similarity_threshold: float = 0.75, + fdedup_seed: int = 42, + fdedup_docs_to_remove_folder: str = "docs_to_remove", + fdedup_duplicate_list_location: str = os.path.join( + "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ), + fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, # additional parameters @@ -136,63 +190,47 @@ def fdedup( wait_print_tmout - time between prints, sec http_retries - http retries for API server calls :param data_s3_access_secret - s3 access secret + :param scdata_s3_access_secret - signature calculation s3 access secret + :param 
dcdata_s3_access_secret - data cleaning s3 access secret :param data_s3_config - s3 configuration :param data_max_files - max files to process :param data_num_samples - num samples to process - :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_contents_column - document column name + :param fdedup_document_id_column - integer document id column name :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. - A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_num_bands - number of bands + :param fdedup_num_minhashes_per_band - length of a band + :param fdedup_word_shingle_size - length of word shingles + :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' + :param fdedup_jaccard_similarity_threshold - similarity threshold + :param fdedup_seed - seed for the random number generator + :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids + :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates + :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params - compute_exec_params = compute_exec_params_op( + compute_common_exec_params = compute_common_params_op( worker_options=ray_worker_options, - actor_options=runtime_actor_options, data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=run_id, - runtime_code_location=runtime_code_location, - doc_column=fdedup_doc_column, - id_column=fdedup_id_column, - cluster_column=fdedup_cluster_column, - bucket_cpu=fdedup_bucket_cpu, - doc_cpu=fdedup_doc_cpu, - mhash_cpu=fdedup_mhash_cpu, num_permutations=fdedup_num_permutations, - threshold=fdedup_threshold, - shingles_size=fdedup_shingles_size, - delimiters=fdedup_delimiters, - random_delay_limit=fdedup_random_delay_limit, - snapshot_delay=fdedup_snapshot_delay, - use_doc_snapshot=fdedup_use_doc_snapshot, - use_bucket_snapshot=fdedup_use_bucket_snapshot, 
n_samples=fdedup_n_samples, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) + fdedup_num_segments = compute_common_exec_params.outputs["num_segments"] + runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] + runtime_num_actors = compute_common_exec_params.outputs["num_actors"] # start Ray cluster ray_cluster = create_ray_op( @@ -204,21 +242,147 @@ def fdedup( additional_params=additional_params, ) ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ray_cluster.after(compute_common_exec_params) + + # Get the parameters for the signature calculation job + compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_column=fdedup_contents_column, + id_column=fdedup_document_id_column, + num_permutations=fdedup_num_permutations, + num_bands=fdedup_num_bands, + num_minhashes_per_band=fdedup_num_minhashes_per_band, + word_shingle_size=fdedup_word_shingle_size, + shingle_option=fdedup_shingle_option, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + seed=fdedup_seed, + ) + ComponentUtils.add_settings_to_component(compute_signature_calc_exec_params, ONE_HOUR_SEC * 2) + compute_signature_calc_exec_params.after(ray_cluster) + + # Execute signature calculation job + execute_signature_calc_job = execute_signature_calc_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_signature_calc_exec_params.output, + exec_script_name=SIGNATURE_CALC_EXEC_SCRIPT_NAME, + server_url=server_url, + prefix="scdata", + ) + ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) + execute_signature_calc_job.after(compute_signature_calc_exec_params) + + # Get the parameters for the cluster analysis job + compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + num_bands=fdedup_num_bands, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + ) + ComponentUtils.add_settings_to_component(compute_cluster_analysis_exec_params, ONE_HOUR_SEC * 2) + compute_cluster_analysis_exec_params.after(execute_signature_calc_job) + # Execute job + execute_cluster_analysis_job = execute_cluster_analysis_job_op( + ray_name=ray_name, + run_id=run_id, + 
additional_params=additional_params, + exec_params=compute_cluster_analysis_exec_params.output, + exec_script_name=CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) + + compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + duplicate_docids_folder=fdedup_docs_to_remove_folder, + duplicate_list_location=fdedup_duplicate_list_location, + ) + ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) + compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) + # Execute job + execute_get_duplicate_list_job = execute_get_duplicate_list_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_get_duplicate_list_exec_params.output, + exec_script_name=GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) + + compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + id_column=fdedup_document_id_column, + duplicate_list_location=fdedup_duplicate_list_location, + operation_mode=fdedup_operation_mode, + ) + ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) + compute_data_cleaning_exec_params.after(execute_get_duplicate_list_job) + # Execute job - execute_job = execute_ray_jobs_op( + execute_data_cleaning_job = execute_data_cleaning_job_op( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, + exec_params=compute_data_cleaning_exec_params.output, + exec_script_name=DATA_CLEANING_EXEC_SCRIPT_NAME, server_url=server_url, + prefix="dcdata", ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, 
dcdata_s3_access_secret, prefix="dcdata" + ) + execute_data_cleaning_job.after(compute_data_cleaning_exec_params) if __name__ == "__main__": # Compiling the pipeline - compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml")) + compiler.Compiler().compile(fuzzydedup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 726200339..cd3a58b99 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,10 +10,79 @@ # limitations under the License. ################################################################################ +from typing import Any, Dict, NamedTuple -def fdedup_compute_execution_params( + +def compute_common_params( worker_options: dict, # ray worker configuration - actor_options: dict, # actor's resource requirements + data_s3_config: str, # S3 configuration + num_permutations: int, # number of permutations (minhashes) per document + n_samples: int, # files to sample for number of documents estimation +) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]): + + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB + from runtime_utils import KFPUtils + + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling: dict[str, Any] + sampling, _ = data_access.sample_input_data(n_samples=n_samples) + number_of_docs = int(sampling.get("estimated number of docs")) + if number_of_docs == 0: + print(f"Estimated number of documents and documents size is zero. Please verify the input path.") + sys.exit(1) + print(f"Estimated number of docs: {number_of_docs}") + # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: + # 8 bytes are taken by the band hash + # (num_permutations * 4) bytes are taken by the min hashes + # 20 bytes to provide some extra space for storage in a table + # The total amount of space needed by a band is number_of_docs * doc_bytes. + # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB + doc_bytes = 8 + num_permutations * 4 + 20 + band_bytes = number_of_docs * doc_bytes + num_segments = 1 + (band_bytes // (3 * GB)) + print(f"Number of segments: {num_segments}") + + # To process data efficiently, each actor needs 16GB of memory. + # The actor config controls CPU allocation, not memory; + # use CPU allocation s.t. the number of actors on a worker provides access to 16GB of memory for each actor. 
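A worked example of the sizing logic in compute_common_params, using the pipeline defaults plus hypothetical corpus and worker numbers (the document count and worker_options values below are assumptions for illustration only):

GB = 1024 * 1024 * 1024                                # assuming data_processing.utils defines GB as 2**30
num_permutations, number_of_docs = 112, 10_000_000     # default permutations + hypothetical corpus size
doc_bytes = 8 + num_permutations * 4 + 20              # 476 bytes per document
band_bytes = number_of_docs * doc_bytes                # ~4.4 GiB per band
num_segments = 1 + band_bytes // (3 * GB)              # 2 segments per band

# Actor sizing with hypothetical worker_options of 3 replicas, 16 CPUs, 64 GB each:
replicas, cpu, memory = 3, 16, 64
num_actors_per_node = memory // 16                     # 16 GB per actor -> 4 actors per node
cpus_per_actor = (cpu - 1) / num_actors_per_node       # 15 / 4 = 3.75 CPUs per actor
# the function below then multiplies by (replicas - 1) worker nodes and caps the total at 2000 actors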
+ # Also, to keep S3 utilization in check, limit the number of actors to 2000 + num_nodes = worker_options["replicas"] + cpu_per_node = worker_options["cpu"] - 1 + memory_per_node = worker_options["memory"] + + memory_per_actor = 16 # GB + max_num_actors = 2000 + num_actors_per_node: int = int(memory_per_node / memory_per_actor) + if num_actors_per_node == 0: + num_actors_per_node = 1 + # never run actors on the head node, so (n - 1) nodes to run actors + num_actors = (num_nodes - 1) * num_actors_per_node + + while num_actors > max_num_actors: + num_actors -= num_nodes - 1 + num_actors_per_node -= 1 + print(f"Number of actors per node = {num_actors_per_node}") + cpus_per_actor = cpu_per_node / num_actors_per_node + print(f"CPUs per actor = {cpus_per_actor}") + + from collections import namedtuple + + fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) + return fdedup_params(num_segments, num_actors, cpus_per_actor) + + +def signature_calc_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -22,27 +91,20 @@ def fdedup_compute_execution_params( runtime_code_location: dict, # code location doc_column: str, # document column name id_column: str, # integer document id column name - cluster_column: str, # cluster column name - bucket_cpu: float, # number of CPUs per bucket hash - doc_cpu: float, # number of CPUs per doc hash - mhash_cpu: float, # number of CPUs per minhash hash num_permutations: int, # number of permutations + num_bands: int, # number of bands + num_minhashes_per_band: int, # band length + word_shingle_size: int, # number of words in shingle + shingle_option: str, # type of shingle, one of 'word' or 'char' threshold: float, # threshold, - shingles_size: int, # number of words in shingle - delimiters: str, # delimiter for splitting document - random_delay_limit: int, # delay between reads to reduce S3 load. 
- # A random number between 0 and random_delay_limit is used - snapshot_delay: int, # delay between restoring individual actors - use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots - use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots - n_samples: int, # number of samples to use -) -> dict: # NamedTuple( - # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + num_segments: int, # number of segments + seed: int, # seed for the random number generator +) -> dict: """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements + Compute fuzzy dedup execution parameters for signature calculation + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -51,182 +113,202 @@ def fdedup_compute_execution_params( :param runtime_code_location: code location :param doc_column: document column name :param id_column: integer document id column name - :param cluster_column: cluster column name - :param bucket_cpu: number of CPUs per bucket hash - :param doc_cpu: number of CPUs per doc hash - :param mhash_cpu: number of CPUs per minhash hash :param num_permutations: number of permutations + :param num_bands: number of bands + :param num_minhashes_per_band: band length + :param word_shingle_size: number of words in shingle + :param shingle_option: str: type of shingle, one of 'word' or 'char' :param threshold: threshold, - :param shingles_size: number of words in shingle - :param delimiters: delimiter for splitting document - :param random_delay_limit: # delay between reads to reduce S3 load. 
A random number between 0 and random_delay_limit is used - :param snapshot_delay: delay between restoring individual actors - :param use_doc_snapshot: flag to skip documents building and start from existing snapshots - :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots - :param n_samples: number of samples to use + :param num_segments: number of segments + :param seed: seed for the random number generator :return: a dictionary with a Ray Job execution parameters """ - import math - import sys - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt + # fuzzy parameters for signature calculation + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + print(f"runtime_actor_options = {runtime_actor_options}") + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "minhash_contents_column": doc_column, + "minhash_document_id_column": id_column, + "minhash_num_permutations": num_permutations, + "minhash_num_bands": num_bands, + "minhash_num_minhashes_per_band": num_minhashes_per_band, + "minhash_word_shingle_size": word_shingle_size, + "minhash_shingle_option": shingle_option, + "minhash_jaccard_similarity_threshold": threshold, + "minhash_num_segments": num_segments, + "minhash_seed": seed, + "scdata_s3_config": data_s3_config, + } + + +def cluster_analysis_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + 
data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + num_bands: int, # number of bands + threshold: float, # threshold, + num_segments: int, # number of segments +) -> dict: + + """ + Compute fuzzy dedup execution parameters for cluster analysis + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param num_bands: number of bands + :param threshold: threshold, + :param num_segments: number of segments + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=threshold, - num_perm=num_permutations, - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") # Get cluster parameters - cluster_cpu = worker_options["replicas"] * worker_options["cpu"] - cluster_memory = worker_options["replicas"] * worker_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu -= 1 - cluster_memory *= 0.85 - # get actor requirements - actor_cpu = actor_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling, _ = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. Please verify the input path.") - sys.exit(1) - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors <= 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) - - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") + data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "cluster_num_bands": num_bands, + "cluster_jaccard_similarity_threshold": threshold, + "cluster_num_segments": num_segments, + } - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") +def get_duplicate_list_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + duplicate_docids_folder: str, # folder with the docs IDs to remove + duplicate_list_location: str, # location of the list of duplicate doc ids +) -> dict: + """ + Compute fuzzy dedup execution parameters for get duplicate list step + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + 
:param duplicate_docids_folder: folder with the docs IDs to remove + :param duplicate_list_location: location of the list of duplicate doc ids + :return: a dictionary with a Ray Job execution parameters + """ + import json + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = base_folder + data_s3_config_dict["output_folder"] = base_folder + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "fdlist_docs_to_remove": duplicate_docids_folder, + "fdlist_consolidated_filename": duplicate_list_location, + } + + +def data_cleaning_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + id_column: str, # integer document id column name + duplicate_list_location: str, # location of the list of duplicate doc ids + operation_mode: str, # filter (non-)duplicates or annotate +) -> dict: + """ + Compute fuzzy dedup execution parameters + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param id_column: integer document id column name + :param duplicate_list_location: location of the list of duplicate doc ids + :param operation_mode: filter (non-)duplicates or annotate + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, - "runtime_num_workers": n_workers, - "runtime_worker_options": str(actor_options), + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - "fdedup_doc_column": doc_column, - 
"fdedup_id_column": id_column, - "fdedup_cluster_column": cluster_column, - "fdedup_bucket_cpu": bucket_cpu, - "fdedup_doc_cpu": doc_cpu, - "fdedup_mhash_cpu": mhash_cpu, - "fdedup_num_doc_actors": d_actors, - "fdedup_num_bucket_actors": b_actors, - "fdedup_num_minhash_actors": m_actors, - "fdedup_num_preprocessors": n_preprocessors, - "fdedup_num_permutations": num_permutations, - "fdedup_threshold": threshold, - "fdedup_shingles_size": shingles_size, - "fdedup_delimiters": delimiters, - "fdedup_random_delay_limit": random_delay_limit, - "fdedup_snapshot_delay": snapshot_delay, - "fdedup_use_doc_snapshot": use_doc_snapshot, - "fdedup_use_bucket_snapshot": use_bucket_snapshot, + "fdclean_document_id_column": id_column, + "fdclean_duplicate_list_location": duplicate_list_location, + "fdclean_operation_mode": operation_mode, } diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ b/transforms/universal/fdedup/python/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile new file mode 100644 index 000000000..071478870 --- /dev/null +++ b/transforms/universal/fdedup/python/Dockerfile @@ -0,0 +1,44 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy source data +COPY src/ src/ + +# copy source data +COPY ./src/fdedup_transform_python.py fdedup_transform_python.py +COPY ./src/fdedup_transform_python.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile new file mode 100644 index 000000000..05f6bf5ca --- /dev/null +++ b/transforms/universal/fdedup/python/Makefile @@ -0,0 +1,64 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
+include $(REPOROOT)/transforms/.make.transforms + +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.python-venv + +test:: .transforms.python-test + +clean:: .transforms.clean + +image:: .transforms.python-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-python + +setup:: .transforms.setup + +# distribution versions is the same as image version. +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.python-test-image + +run-cli-sample: .transforms.run-cli-python-sample + +run-local-sample: .transforms.run-local-sample + +run-local-python-sample: .transforms.run-local-python-sample + +#run-s3-ray-sample: .transforms.run-s3-ray-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md new file mode 100644 index 000000000..34f18c73b --- /dev/null +++ b/transforms/universal/fdedup/python/README.md @@ -0,0 +1,11 @@ +# Fuzzy Dedup + +Please see the set of +[transform project conventions](../../../README.md) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary + +The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see +[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. 
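The transform computes per-document MinHash signatures over shingles and groups them into bands for locality-sensitive hashing; the downstream cluster-analysis, duplicate-list and data-cleaning steps then turn band collisions into a cleaned (or annotated) dataset. A minimal sketch of the underlying MinHash/banding idea, using the Murmur_MH helper added under src/ in this PR; the sample text, the shingling helper, and the 14x8 band layout mirror the defaults used elsewhere in this patch and are illustrative only, not the transform's implementation:

from Murmur_MH import Murmur_MH   # helper class added under src/ in this PR

def word_shingles(text: str, k: int = 5) -> list[str]:
    # illustrative shingling; the transform has its own word/char shingle_option handling
    words = text.split()
    return [" ".join(words[i : i + k]) for i in range(max(1, len(words) - k + 1))]

mm = Murmur_MH(num_perm=112, seed=42)
sig_a = mm.minhash(word_shingles("the quick brown fox jumps over the lazy dog today"))
sig_b = mm.minhash(word_shingles("the quick brown fox jumped over the lazy dog today"))
print(Murmur_MH.jaccard(sig_a, sig_b))   # estimated Jaccard similarity of the two texts

# With b bands of r minhashes each (defaults: b=14, r=8, i.e. 112 permutations),
# two documents of true similarity s collide in at least one band with
# probability 1 - (1 - s**r)**b (see the MMDS chapter linked above).
b, r = 14, 8
for s in (0.5, 0.75, 0.9):
    print(s, 1 - (1 - s**r) ** b)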
\ No newline at end of file diff --git a/data-processing-lib/spark/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml similarity index 54% rename from data-processing-lib/spark/pyproject.toml rename to transforms/universal/fdedup/python/pyproject.toml index 89b4d9bf8..97be33d54 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,31 +1,21 @@ [project] -name = "data_prep_toolkit_spark" +name = "dpk_fdedup_transform_python" version = "0.2.2.dev2" -keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" -description = "Data Preparation Toolkit Library for Spark" +description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyspark>=3.5.2", - "psutil>=6.0.0", - "PyYAML>=6.0.2" -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ @@ -44,7 +34,7 @@ dev = [ package_dir = ["src","test"] [options.packages.find] -where = ["src/data_processing_spark"] +where = ["src/"] [tool.pytest.ini_options] # Currently we use low coverage since we have to run tests separately (see makefile) diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..4e69a72e4 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +data-prep-toolkit==0.2.2.dev2 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +scipy>=1.14.1, <2.0.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py new file mode 100644 index 000000000..03d5047ea --- /dev/null +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -0,0 +1,112 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + + +import logging +import os +from typing import List, Set + +import mmh3 +import numpy as np + + +class Murmur_MH: + def __init__(self, num_perm=64, seed=42, hashfunc=None): + self.seed = seed + self.num_perm = num_perm # the number of buckets, i.e. the vector length after self.minhash() call + self.permutations = self._init_permutations(seed, num_perm) + + def _init_permutations(self, seed, num_perm): + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + max_int = np.uint64((1 << 64) - 1) + # initialize pseudo random number generator with given seed value + gen = np.random.RandomState(seed) + # get self.num_perm pseudo random numbers between 2 and max_int (excl) + permutations = np.array( + [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], + dtype=np.uint64, + ).T + # make all even pseudo random numbers odd by adding 1 + permutations[permutations % 2 == 0] += 1 + return permutations + + def minhash(self, shingles: List[str]): + """return np.array of minhash""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + ) + + def minhash2(self, shingles: List[str], doc_len: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0), + doc_len, + ) + + def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + .tolist(), + doc_len, + doc_id, + ) + + @staticmethod + def jaccard(mh1: np.array, mh2: np.array) -> float: + """ + The Jaccard similarity measures the similarity between two sets of data + to see which members are shared and distinct. + + The Jaccard similarity is calculated by dividing the number of observations + in both sets by the number of observations in either set. + + Developed by Paul Jaccard, the index ranges from 0 to 1. + The closer to 1, the more similar the two sets of data. + + As a document is represented by a set. We use Jaccard distance to see how similar between two documents. 
+ """ + assert len(mh1) == len(mh2) + return np.count_nonzero(mh1 == mh2) / len(mh1) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py new file mode 100644 index 000000000..bb785021c --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py new file mode 100644 index 000000000..a9822babe --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -0,0 +1,336 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import io +import os +import re +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from Murmur_MH import Murmur_MH + + +short_name = "cluster" +cli_prefix = f"{short_name}_" + +# configuration keys +num_bands_key = "num_bands" +""" This key holds the number of bands used in the banding technique""" +num_segments_key = "num_segments" +""" This key holds the number of segments dividing the hashing space for each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" + +# command line arguments +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands used in the banding technique""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments dividing the hashing space for each band""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" + +captured_arg_keys = [ + num_bands_key, + num_segments_key, + jaccard_similarity_threshold_key, + sort_output_key, +] + +# defaults +num_bands_default = 14 +""" Default number of bands used in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments dividing the hashing space for each band""" +sort_output_default = False + + +class ClusterAnalysisTransform(AbstractFolderTransform): + """ + This is the second transform of the fuzzy dedup pipeline. It runs in parallel: + for each band, the hashing interval is divided into segments. A cluster analysis + uses as input all the parquet files from segment of a band. The `bands` output + of the signature calculation, the first transform in the fuzzy dedup pipeline + contains all the data for a given segment s of a specific band b in the + subfolder `bands/band=b/segment=s`. + The transform loads all the parquet files in the `bands/band=b/segment=s` + subfolder. Each one of these parquet files has two columns: the `band_hash` + and a `data` structure, which includes the `document_id`, the `minhashes` and + the `document_size` fields. Once all the files have been loaded in a single + dataframe, a `group_by` operation on the `band_hash` field is performed in + that dataframe. All the documents that have the same band_hash are grouped + in a cluster. Subsequently, the documents of each cluster are sorted in + descending order according to their size, and a Jaccard similarity is + calculated between the cluster documents. The documents for which the Jaccard + similarity is above the `jaccard_similarity_threshold` remain in the cluster, + the others are removed from the cluster. 
Finally, from each cluster that has + more than one document after running the Jaccard similarity, we select a doc + to keep (the largest size document), and mark the other documents as + duplicates. The resulting clusters are saved in a file for further analysis. + + Args: + num_bands: number of bands used in the banding technique + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + num_segments: the number of segments dividing the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_segments = config.get(num_segments_key, num_segments_default) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.sort_output = config.get(sort_output_key, sort_output_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Cluster analysis for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") + + # consolidate into a single data frame band hashes computed by workers + band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + metadata |= consolidation_stats + # cluster grouping by band hashes + cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + metadata |= cluster_stats + # cluster analysis using jaccard similarity + jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + metadata |= jaccard_stats + # Generate the docs_to_remove dataframe + docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") + output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) + self.logger.debug(f"{len(docs_to_remove_dataframe)} documents marked to remove") + metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + band_segment_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} 
has {len(df)} rows") + band_segment_dataframe = band_segment_dataframe.vstack(df) + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": band_segment_dataframe.to_arrow().nbytes, + "consolidated_rows": len(band_segment_dataframe), + } + return band_segment_dataframe, consolidation_stats + + def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") + cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( + pl.col("cluster_length") > 1 + ) + # self.logger.info(f"file_name = {file_name}") + num_clusters = len(cluster_dataframe) + if num_clusters > 0: + sum_cdocs = cluster_dataframe.select(pl.sum("cluster_length")).item() + max_cdocs = cluster_dataframe.select(pl.max("cluster_length")).item() + min_cdocs = cluster_dataframe.select(pl.min("cluster_length")).item() + avg_cdocs = cluster_dataframe.select(pl.mean("cluster_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.debug(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + cluster_stats = { + "groupby_clusters": num_clusters, + "cluster_duplicate_docs": sum_cdocs, + } + return cluster_dataframe, cluster_stats + + def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + # Define the schema with specific data types + schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + doc_ids_lists = [] + docs_to_remove_lists = [] + len_of_docs2remove_lists = [] + for row in df.iter_rows(named=True): + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_lists += doc_ids_list + docs_to_remove_lists += docs_to_remove_list + len_of_docs2remove_lists += len_of_docs2remove_list + jaccard_cluster_dataframe = pl.DataFrame( + { + "first_doc": doc_ids_lists, + "docs_to_remove": docs_to_remove_lists, + "docs_to_remove_length": len_of_docs2remove_lists, + }, + schema=schema, + ) + filtered_jaccard_dataframe = jaccard_cluster_dataframe.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_jaccard_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_jaccard_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_jaccard_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_jaccard_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_jaccard_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.debug(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + jaccard_stats = { + "jaccard_clusters": num_clusters, + "jaccard_duplicate_docs": sum_cdocs, + } + if self.sort_output: + filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") + return filtered_jaccard_dataframe, jaccard_stats + + def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + # Process row and return a new list of Series or a new row + threshold 
= self.jaccard_similarity_threshold + doc_ids_list = [] + docs_to_remove_list = [] + len_of_docs2remove_list = [] + # sort documents + document_data = row["document_data"] + + # Sort the list by 'document_length' + sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"])) + + # Extracting int_id_column values into a list + doc_list = [item["int_id_column"] for item in sorted_document_data] + + # Creating a dictionary with int_id_column as key and minhashes as value + doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data} + + while len(doc_list) > 1: + docs_to_remove = [] + new_doc_list = [] + # this is the document we are going to keep + first_doc = doc_list[0] + first_mh = doc_minhashes[first_doc] + for int_id_column in doc_list[1:]: + doc_mh = doc_minhashes[int_id_column] + distance = Murmur_MH.jaccard(np.array(first_mh), np.array(doc_mh)) + if distance >= threshold: + docs_to_remove.append(int_id_column) + else: + new_doc_list.append(int_id_column) + if len(docs_to_remove) > 0: + docs_to_remove = list(set(docs_to_remove)) + doc_ids_list.append(first_doc) + docs_to_remove_list.append(docs_to_remove) + len_of_docs2remove_list.append(len(docs_to_remove)) + doc_list = new_doc_list + + return doc_ids_list, docs_to_remove_list, len_of_docs2remove_list + + +class ClusterAnalysisTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=ClusterAnalysisTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) + """ + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="The number of bands used in the banding technique", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="The number of segments dividing the hashing space for each band", + ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py new file mode 100644 index 000000000..c35c5a711 --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -0,0 +1,76 @@ +# (C) Copyright IBM Corp. 2024. 
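
# A standalone sketch of the greedy loop implemented in jaccard_distance_calculation above:
# within a cluster, keep the largest document and mark every other document whose estimated
# Jaccard similarity meets the threshold as a duplicate, then repeat on the remainder.
# Toy data only; the field names mirror the transform but nothing here is imported from it.
import numpy as np

def greedy_duplicates(docs, threshold=0.75):
    # docs: list of dicts with int_id_column, document_length, minhashes
    docs = sorted(docs, key=lambda d: (-d["document_length"], d["int_id_column"]))
    ids = [d["int_id_column"] for d in docs]
    mh = {d["int_id_column"]: np.asarray(d["minhashes"]) for d in docs}
    removed = []
    while len(ids) > 1:
        keep, rest = ids[0], ids[1:]
        survivors = []
        for doc_id in rest:
            sim = np.count_nonzero(mh[keep] == mh[doc_id]) / len(mh[keep])
            (removed if sim >= threshold else survivors).append(doc_id)
        ids = survivors
    return removed

cluster = [
    {"int_id_column": 1, "document_length": 900, "minhashes": [1, 2, 3, 4]},
    {"int_id_column": 2, "document_length": 850, "minhashes": [1, 2, 3, 9]},
    {"int_id_column": 3, "document_length": 100, "minhashes": [7, 8, 9, 9]},
]
print(greedy_duplicates(cluster))  # [2]: near-identical to doc 1, which is kept
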
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class ClusterAnalysisPythonRuntime(DefaultPythonTransformRuntime): + """ + Cluster analysis runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup ClusterAnalysis + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis python transform") + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py new file mode 100644 index 000000000..aa4aabb90 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -0,0 +1,60 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
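
# A small sketch of the folder fan-out used by ClusterAnalysisPythonRuntime.get_folders above:
# each (band, segment) pair produced by the signature-calculation step becomes one
# independently processable input folder. The counts below are just the documented defaults
# (14 bands, 2 segments in the local example).
import os

num_bands, num_segments = 14, 2
folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(num_bands) for s in range(num_segments)]
print(len(folders))  # 28 folders: band=0/segment=0, band=0/segment=1, band=1/segment=0, ...
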
+################################################################################ + +import os +import sys + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py new file mode 100644 index 000000000..74597068c --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -0,0 +1,179 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
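
# A minimal polars sketch of the three operation modes offered by the DataCleaningTransform
# defined below: an anti-join drops duplicates, an inner join keeps only duplicates, and a
# left join annotates them. The tiny frames are illustrative; the real transform reads the
# consolidated duplicate list from parquet.
import polars as pl

data = pl.DataFrame({"int_id_column": [1, 2, 3, 4], "contents": ["a", "b", "a'", "c"]})
dupes = pl.DataFrame({"int_id_column": [3]})

filtered = data.join(dupes, on="int_id_column", how="anti")     # filter_duplicates
only_dupes = data.join(dupes, on="int_id_column", how="inner")  # filter_non_duplicates
annotated = data.join(
    dupes.with_columns(pl.lit("d").alias("duplicate")), on="int_id_column", how="left"
).with_columns(pl.col("duplicate").fill_null(""))               # annotate
print(filtered["int_id_column"].to_list())  # [1, 2, 4]
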
+################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.data_access import DataAccessFactory +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger + + +short_name = "fdclean" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +duplicate_list_location_key = "duplicate_list_location" +""" This key holds the location of the list of duplicate documents marked for removal""" +operation_mode_key = "operation_mode" +""" This key holds the operation mode: 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" +""" Location of the list of duplicate documents marked for removal""" +operation_mode_cli_param = f"{cli_prefix}{operation_mode_key}" +""" Operation mode, can be one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" + +captured_arg_keys = [ + document_id_column_key, + duplicate_list_location_key, + operation_mode_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +duplicate_list_location_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default location of the list of duplicate documents marked for removal""" +operation_mode_default = "filter_duplicates" +""" Default value for operation mode, will filter out all the duplicate documents""" + +dataclean_data_factory_key = "dc_data_factory" +dataclean_data_access_key = "dc_data_access" + + +class DataCleaningTransform(AbstractTableTransform): + """ + This is the third transform of the fuzzy dedup pipeline. It takes as input + the list of the documents to remove (identified as duplicates during the + cluster analysis phase, and the original dataset. Each dataset file is + imported into a table, and the documents that are in the documents to remove + list are filtered out from that table. The output is a new dataset, which + keeps the directory structure of the input dataset, but has all the fuzzy + duplicates removed. + + Args: + duplicate_location: location (local or s3) of the duplicate document list + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.logger = get_logger(__name__) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default) + self.operation_mode = config.get(operation_mode_key, operation_mode_default) + contents = config.get("df") + self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) + self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + input_df = pl.from_arrow(table) + # handle the case when the doc_id columns in the input dataframe and the + # docs_to_remove_df have different types, i.e. one is int32 and the + # other is int64 + input_doc_id_type = input_df[self.document_id_column].dtype + if input_doc_id_type != self.docs_to_remove_df[self.document_id_column].dtype: + self.docs_to_remove_df = self.docs_to_remove_df.select( + pl.col(self.document_id_column).cast(input_doc_id_type) + ) + if self.operation_mode == "filter_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + elif self.operation_mode == "filter_non_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="inner") + else: # self.operation_mode == "annotation" + duplicates_df = self.docs_to_remove_df.with_columns(pl.lit("d").alias("duplicate")) + result_df = input_df.join(duplicates_df, on=self.document_id_column, how="left").with_columns( + pl.col("duplicate").fill_null("") + ) + result_table = result_df.to_arrow() + metadata = { + "input_files": 1, + "input_docs": table.num_rows, + "input_bytes": table.nbytes, + "output_files": 1, + "output_docs": result_table.num_rows, + "output_bytes": result_table.nbytes, + "filtered_docs": (table.num_rows - result_table.num_rows), + "filtered_bytes": (table.nbytes - result_table.nbytes), + } + return [result_table], metadata + + +class DataCleaningTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningTransform): + super().__init__( + name=short_name, + transform_class=transform_class, + remove_from_metadata=[dataclean_data_factory_key], + ) + self.daf = DataAccessFactory(cli_arg_prefix="dcdata_") + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{duplicate_list_location_cli_param}", + type=str, + default=duplicate_list_location_default, + help="location of duplicate document list that are marked for removal", + ) + parser.add_argument( + f"--{operation_mode_cli_param}", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + default=operation_mode_default, + help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + self.daf.add_input_params(parser=parser) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + self.params[dataclean_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py new file mode 100644 index 000000000..edef8b9c5 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -0,0 +1,103 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + DefaultPythonTransformRuntime, + PythonTransformRuntimeConfiguration, +) +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class DataCleaningPythonRuntime(DefaultPythonTransformRuntime): + """ + Data cleaning runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. 
This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for fuzzy dedup data cleaning step + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param: transform_configuration - transform configuration class + :param: runtime_class - name of the runtime configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(DataCleaningTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py new file mode 100644 index 000000000..b77f44401 --- /dev/null +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -0,0 +1,240 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
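
# A standalone sketch of the path arithmetic in DataCleaningPythonRuntime.get_transform_config
# above: a relative duplicate_list_location is resolved against the parent of the output
# folder, so the default "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
# lands next to the cleaned output. The paths below are made up for illustration.
def resolve_duplicate_list(output_folder: str, duplicate_list_location: str) -> str:
    if duplicate_list_location.startswith("/"):
        return duplicate_list_location
    out_paths = output_folder.rstrip("/").split("/")
    dupl_list_paths = duplicate_list_location.split("/")
    location = "/".join([p.strip("/") for p in out_paths[:-1] + dupl_list_paths])
    if location.startswith("s3://"):
        _, location = location.split("://")
    return location

print(resolve_duplicate_list("/tmp/fdedup/cleaned",
                             "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"))
# /tmp/fdedup/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
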
+################################################################################ + +import argparse +import ast +import os +import sys + +import cluster_analysis_transform +import data_cleaning_transform +import get_duplicate_list_transform +import signature_calc_transform +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils, get_logger, str2bool +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +SERVICE_DICT = { + "SignatureCalculation": "minhash", + "ClusterAnalysis": "cluster", + "GetDuplicateList": "fdlist", + "DataCleaning": "fdclean", +} + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +ARGS_MAP = { + "minhash": signature_calc_transform.captured_arg_keys, + "cluster": cluster_analysis_transform.captured_arg_keys, + "fdlist": get_duplicate_list_transform.captured_arg_keys, + "fdclean": data_cleaning_transform.captured_arg_keys, +} + + +class ServiceOrchestrator: + def __init__(self, global_params: argparse.Namespace = None): + self.global_params = global_params + self.logger = get_logger(__name__) + + def orchestrate(self): + service_list = self.global_params.services.split(",") + for service in service_list: + self.logger.info(f"Starting {service} step") + if service not in SERVICE_DICT: + err_msg = f"Unknown service {service} specified. 
Must be one of {SERVICE_DICT.keys()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + service_short_name = SERVICE_DICT[service] + service_params = self.get_arguments(self.global_params, service_short_name) + self.logger.info(f"Got parameters for {service}") + status = self.execute_service(service_short_name, service_params) + if status == 0: + self.logger.info(f"{service} completed successfully") + else: + self.logger.error(f"{service} failed with status {status}, aborting ...") + break + + def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: + sys_argv = ["python"] + in_args_dict = vars(in_args) + all_module_arguments = ARGS_MAP.get(service_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + for k, v in passed_args.items(): + sys_argv.append(f"--{service_name}_{k}") + sys_argv.append(str(v)) + if service_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif service_name == "fdlist": + input_folder = in_args_dict["output_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "fdclean": + input_folder = in_args_dict["input_folder"] + operation_mode = in_args_dict.get("operation_mode", "filter_duplicates") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + output_folder = os.path.join(in_args_dict["output_folder"], output_subfolder) + else: + self.logger.error(f"Unknown service name: {service_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + if in_args.s3_cred is not None: + s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) + sys_argv.append("--data_s3_cred") + sys_argv.append(s3_cred_ast) + elif ( + s3_creds.get("access_key") is not None + and s3_creds.get("secret_key") is not None + and s3_creds.get("url") is not None + ): + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + elif service_short_name == "cluster": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + else: + err_msg = f"Unknown service {service_short_name} specified. 
Must be one of {SERVICE_DICT.values()}"
+            self.logger.error(err_msg)
+            raise ValueError(err_msg)
+        status = launcher.launch()
+        return status
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Service Orchestrator")
+
+    # Define command line arguments
+    parser.add_argument("--input_folder", type=str, required=True, help="Input folder path")
+    parser.add_argument("--output_folder", type=str, required=True, help="Output folder path")
+
+    parser.add_argument(
+        "--operation_mode",
+        choices=["filter_duplicates", "filter_non_duplicates", "annotate"],
+        required=False,
+        help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents",
+    )
+    parser.add_argument(
+        "--contents_column", type=str, required=False, help="name of the column that stores document text"
+    )
+    parser.add_argument(
+        "--document_id_column", type=str, required=False, help="name of the column that stores the unique document id"
+    )
+    parser.add_argument(
+        "--seed", type=int, required=False, help="seed used to instantiate the random number generator"
+    )
+    parser.add_argument(
+        "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation"
+    )
+    parser.add_argument(
+        "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation"
+    )
+    parser.add_argument(
+        "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band"
+    )
+    parser.add_argument(
+        "--word_shingle_size", type=int, required=False, help="number of words included in one shingle"
+    )
+    parser.add_argument(
+        "--jaccard_similarity_threshold",
+        type=float,
+        required=False,
+        help="jaccard similarity threshold above which two documents are similar",
+    )
+    parser.add_argument(
+        "--num_segments",
+        type=int,
+        required=False,
+        help="the number of segments dividing the hashing space for each band (for scalability)",
+    )
+    parser.add_argument(
+        "--duplicate_list_location",
+        type=str,
+        required=False,
+        help="path to the file with all the duplicate document ids",
+    )
+
+    # Single argument for service execution
+    parser.add_argument(
+        "--services",
+        type=str,
+        required=False,
+        default="SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning",
+        help="Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning)",
+    )
+
+    parser.add_argument(
+        "--use_s3",
+        type=lambda x: bool(str2bool(x)),
+        default=False,
+        help="use s3",
+    )
+
+    parser.add_argument(
+        "--s3_cred",
+        type=ast.literal_eval,
+        default=None,
+        help="ast string of options for s3 credentials",
+    )
+    parser.add_argument(
+        "--shingle_option",
+        type=str,
+        required=False,
+        default="word",
+        help="Option used for shingling",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+
+    # Parse command line arguments
+    args = parse_args()
+    # Initialize the orchestrator
+    orchestrator = ServiceOrchestrator(global_params=args)
+    # Launch python fuzzy dedup execution
+    orchestrator.orchestrate()
diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py
new file mode 100644
index 000000000..c49124cf1
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py
@@ -0,0 +1,184 @@
+# (C) Copyright IBM Corp. 2024.
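
# A toy sketch of how ServiceOrchestrator.get_arguments above maps the shared CLI options onto
# per-service argv entries: every recognized key is re-emitted as --<service_prefix>_<key>.
# The option names below are only a subset, for illustration.
def to_service_argv(service_prefix: str, options: dict, captured_keys: list[str]) -> list[str]:
    argv = ["python"]
    for key, value in options.items():
        if key in captured_keys and value is not None:
            argv += [f"--{service_prefix}_{key}", str(value)]
    return argv

print(to_service_argv("minhash", {"num_permutations": 112, "num_bands": 14, "seed": None},
                      ["num_permutations", "num_bands", "seed"]))
# ['python', '--minhash_num_permutations', '112', '--minhash_num_bands', '14']
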
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +import re +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from Murmur_MH import Murmur_MH + + +short_name = "fdlist" +cli_prefix = f"{short_name}_" + +# configuration keys +subfolder_key = "docs_to_remove" +""" This key holds the name of the subfolder with the duplicate records""" +consolidated_filename_key = "consolidated_filename" +""" This key holds the name of the file with the consolidated list of duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" + +# command line arguments +subfolder_cli_param = f"{cli_prefix}{subfolder_key}" +""" The name of the subfolder with the duplicate records""" +consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" +""" The name of the file with the consolidated list of duplicates""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" + +captured_arg_keys = [ + subfolder_key, + consolidated_filename_key, + sort_output_key, +] + +# defaults +subfolder_default = "docs_to_remove" +""" Default name of the subfolder with the duplicate records""" +consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default name of the file with the consolidated list of duplicates""" +sort_output_default = False + + +class GetDuplicateListTransform(AbstractFolderTransform): + """ + This is an intermediate step of the fuzzy dedup pipeline. It runs in a single + location and consolidates in a single file all the duplicates found for each + band segment. + Args: + subfolder: name of the subfolder with the duplicate records + consolidated_filename: name of the file with the consolidated list of duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.subfolder = config.get(subfolder_key, subfolder_default) + self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.sort_output = config.get(sort_output_key, sort_output_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Get Duplicate List for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, self.consolidated_filename) + + # consolidate into a single data frame band hashes computed by workers + consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") + metadata |= consolidation_stats + output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + consolidated_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + consolidated_dataframe = consolidated_dataframe.vstack(df) + consolidated_dataframe = consolidated_dataframe.select("docs_to_remove").unique() + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, + "consolidated_rows": len(consolidated_dataframe), + } + if self.sort_output: + consolidated_dataframe = consolidated_dataframe.sort(by="docs_to_remove") + + return consolidated_dataframe, consolidation_stats + + +class GetDuplicateListTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=GetDuplicateListTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the GetDuplicateListTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{subfolder_cli_param}", + type=str, + default=subfolder_default, + help="The name of the subfolder with the duplicate records", + ) + parser.add_argument( + f"--{consolidated_filename_cli_param}", + type=str, + default=consolidated_filename_default, + help="The name of the file with the consolidated list of duplicates", + ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py new file mode 100644 index 000000000..34b18ab04 --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "cluster_analysis") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} + +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py new file mode 100644 index 000000000..703ef630e --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py @@ -0,0 +1,71 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
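
# A small polars sketch of the consolidation step implemented above in
# GetDuplicateListTransform.consolidate_docs_to_remove_files: the per-band/segment duplicate
# lists are stacked and reduced to a unique, optionally sorted set of document ids. The
# in-memory frames stand in for the per-segment parquet files; pl.concat is used here for
# brevity where the transform vstacks incrementally.
import polars as pl

segment_outputs = [
    pl.DataFrame({"docs_to_remove": [3, 7, 7]}),
    pl.DataFrame({"docs_to_remove": [7, 11]}),
]
consolidated = pl.concat(segment_outputs).select("docs_to_remove").unique().sort("docs_to_remove")
print(consolidated["docs_to_remove"].to_list())  # [3, 7, 11]
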
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListPythonRuntime(DefaultPythonTransformRuntime): + """ + Get duplicate list runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup GetDuplicateList + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list python transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py new file mode 100644 index 000000000..be395ed4d --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -0,0 +1,51 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
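
# A minimal sketch of word shingling as used by the signature calculation below: each document
# is turned into overlapping word n-grams (default window of 5 words) whose hashes feed the
# minhash computation. Tokenization here is naive whitespace splitting; the real transform
# also normalizes punctuation/whitespace and supports a character-based option.
def word_shingles(text: str, window_size: int = 5) -> list[str]:
    words = text.lower().split()
    if len(words) <= window_size:
        return [" ".join(words)]
    return [" ".join(words[i : i + window_size]) for i in range(len(words) - window_size + 1)]

print(word_shingles("the quick brown fox jumps over the lazy dog", window_size=5))
# ['the quick brown fox jumps', 'quick brown fox jumps over', ...]
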
+################################################################################ + +import os +import sys +from ast import Param + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + + # create launcher + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py new file mode 100644 index 000000000..6b14e1ba0 --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -0,0 +1,519 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
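
# A back-of-the-envelope sketch of the banding defaults defined below: 112 permutations are
# split into 14 bands of 8 minhashes each, and two documents become candidates if they agree
# on every minhash of at least one band. The standard LSH estimate for that probability is
# 1 - (1 - s**r)**b for Jaccard similarity s, r rows per band, and b bands.
b, r = 14, 8
for s in (0.6, 0.7, 0.75, 0.8, 0.9):
    p = 1 - (1 - s**r) ** b
    print(f"similarity {s:.2f} -> candidate probability {p:.3f}")
# The curve rises steeply around the 0.75 default threshold, so most true near-duplicates
# collide in some band while clearly dissimilar pairs are much less likely to.
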
+################################################################################ +import os +import re +import unicodedata +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any, List + +import mmh3 +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.data_access import DataAccessFactory +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from Murmur_MH import Murmur_MH + + +short_name = "minhash" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +contents_column_key = "contents_column" +""" This key holds the name of the column storing the contents of each document""" +seed_key = "seed" +""" This key holds the seed used to instantiate the random number generator""" +num_permutations_key = "num_permutations" +""" This key holds the number of permutations that determine how many minhashes to calculate for each document""" +num_bands_key = "num_bands" +""" This key holds the number of bands to use in the banding technique""" +num_minhashes_per_band_key = "num_minhashes_per_band" +""" This key holds the number of minhashes to use in each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_key = "word_shingle_size" +""" This key holds the size of the word shingles calculated for each document""" +num_segments_key = "num_segments" +""" This key holds the number of segments across which we divide the hashing space for each band""" +shingle_option_key = "shingle_option" +""" This key holds the option that is used to do shingles calculation for each document""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +contents_column_cli_param = f"{cli_prefix}{contents_column_key}" +""" Name of the column storing the contents of each document""" +seed_cli_param = f"{cli_prefix}{seed_key}" +""" The seed used to instantiate the random number generator""" +num_permutations_cli_param = f"{cli_prefix}{num_permutations_key}" +""" Number of permutations that determine how many minhashes to calculate for each document""" +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands to use in the banding technique""" +num_minhashes_per_band_cli_param = f"{cli_prefix}{num_minhashes_per_band_key}" +""" The number of minhashes to use in each band""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_cli_param = f"{cli_prefix}{word_shingle_size_key}" +""" The size of the word shingles calculated for each document""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments across which we divide the hashing space for each band""" +shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" +""" The option (word/char) used to do shingles calculation for each document""" + +captured_arg_keys = [ + document_id_column_key, + contents_column_key, + seed_key, + num_bands_key, + num_minhashes_per_band_key, + num_permutations_key, + jaccard_similarity_threshold_key, + 
word_shingle_size_key, + num_segments_key, + shingle_option_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +contents_column_default = "contents" +""" Default name of the column storing the contents of each document""" +seed_default = 42 +""" Default seed used to instantiate the random number generator""" +num_permutations_default = 112 +""" Default number of minhashes used for each document (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_bands_default = 14 +""" Default number of bands to use in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_minhashes_per_band_default = 8 +""" Default number of minhashes to use in each band (from FineWeb https://arxiv.org/pdf/2406.17557)""" +word_shingle_size_default = 5 +""" Default size of the word shingles (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments across which we divide the hashing space for each band""" +shingle_option_default = "word" +""" Default option of doing shingling""" + + +sigcalc_data_factory_key = "sc_data_factory" +sigcalc_data_access_key = "sc_data_access" + + +NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +WHITESPACE_PATTERN = re.compile(r"\s+") +PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( + map( + chr, + (x for a, b in ((0, 9), (11, 13), (13, 32), (127, 160)) for x in range(a, b)), + ) +) +PUNCTUATION_SET = set(PUNCTUATION) +PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) + + +class SignatureCalculationTransform(AbstractTableTransform): + """ + This is the first transform of the fuzzy dedup pipeline. First, it calculates, + for each document in a dataset, `num_permutations` minhashes. It accepts as + input the number of bands and the length of each band. If those two parameters + are not specified, then, based on the values of `jaccard_similarity_threshold` + and `num_permutations`, it determines the optimal number of bands, and the + length of each band (how many minhashes will be used to get the signature for + each band). The band signatures, the minhashes and the document lengths are + then saved in the output folder, under a folder structure `bands/band=b/segment=s`. + To improve scalability of the next step of fuzzy dedup, the hash space of + each band is divided into `num_segments` segments. + + Args: + document_id_column: name of the column storing the unique ID assigned to each document + contents_column_cli_param: name of the column storing the contents of each document + seed: the seed used to instantiate the random number generator + num_permutations: number of minhashes to calculate for each document + num_bands: number of bands to use for banding technique + num_minhashes_per_band: number of minhashes to use in each band + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + word_shingle_size: the size of the word shingles calculated for each document + num_segments: the number of segments across which we divide the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. 
+        This is generally called with configuration parsed from the CLI arguments defined
+        by the companion configuration class, SignatureCalculationTransformConfiguration.
+        When running under a launcher, these values are supplied by the launcher's runtime.
+        """
+        super().__init__(config)
+        self.document_id_column = config.get(document_id_column_key, document_id_column_default)
+        self.contents_column = config.get(contents_column_key, contents_column_default)
+        self.seed = config.get(seed_key, seed_default)
+        self.num_permutations = config.get(num_permutations_key, num_permutations_default)
+        self.jaccard_similarity_threshold = config.get(
+            jaccard_similarity_threshold_key, jaccard_similarity_threshold_default
+        )
+        self.word_shingle_size = config.get(word_shingle_size_key, word_shingle_size_default)
+        self.num_segments = config.get(num_segments_key, num_segments_default)
+        self.num_bands = config.get(num_bands_key, num_bands_default)
+        self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        self.shingle_option = config.get(shingle_option_key, shingle_option_default)
+        # use this dataframe to store the minhashes and size for each document
+        self.all_minhashes: pl.DataFrame = None
+        # use this dataframe to store the band hashes for each document
+        self.all_band_hashes: pl.DataFrame = None
+        # this variable keeps track of how many files were processed since last
+        # data write to properly update metadata
+        self.files_processed = 0
+        self.bytes_processed = 0
+        self.data_access = config.get("data_access")
+        self.last_file_name = None
+        self.sc_data_access = config.get(sigcalc_data_access_key, None)
+        if self.sc_data_access is None:
+            self.sc_daf = config.get(sigcalc_data_factory_key, None)
+            if self.sc_daf is None:
+                raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}")
+            self.sc_data_access = self.sc_daf.create_data_access()
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        Convert one input Table into 0 or more output tables, together with a dictionary of
+        execution statistics. This implementation computes the minhashes and band signatures
+        for each document in the table and buffers them; the buffered signatures are written
+        to the output folder once enough documents have accumulated, or at flush() time.
+ """ + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug("----minhash---") + self.last_file_name = file_name + self.files_processed += 1 + self.bytes_processed += table.nbytes + # instantiate with same seed so every worker use same hash functions + mm_min_hash = Murmur_MH(num_perm=self.num_permutations, seed=self.seed) + + # load the data from pyarrow table + df = pl.from_arrow(table) + # read the target columns + df = df.select(self.contents_column, self.document_id_column) + + # generate minhash values + minhashes = df.map_rows( + lambda row: mm_min_hash.minhash2_nosalt( + *self._generate_word_shingles(row, self.shingle_option, window_size=self.word_shingle_size) + ) + ) + # rename columns, cast minhashes to list(uint32) + minhashes = minhashes.select( + pl.col("column_2").alias(self.document_id_column), + pl.col("column_0").cast(pl.List(pl.UInt32)).alias("minhashes"), + pl.col("column_1").alias("document_length"), + ) + # store the minhash calculations to send out at the end of execution + if self.all_minhashes is None: + self.all_minhashes = minhashes + else: + self.all_minhashes = self.all_minhashes.vstack(minhashes) + + # Calculate band hashes + band_hashes_list = self.process_rows_into_bands( + minhashes, + self.num_bands, + self.num_rows, + ) + band_hash_schema = pl.Schema( + { + "band_hash": pl.UInt64, + "band_index": pl.Int32, + self.document_id_column: pl.Int64, + } + ) + band_hashes = pl.DataFrame(band_hashes_list, schema=band_hash_schema) + + # store the band hash calculations to send out at the end of execution + if self.all_band_hashes is None: + self.all_band_hashes = band_hashes + else: + self.all_band_hashes = self.all_band_hashes.vstack(band_hashes) + + if len(self.all_minhashes) > 750000: + tables, metadata = self.write_band_signatures() + else: + tables = [] + metadata = {} + # update metadata stats and return the stats (no tables are returned in transform) + return tables, metadata + + def flush(self) -> tuple[list[pa.Table], dict[str, Any]]: + """ + This is supporting method for transformers, that implement buffering of tables, for example coalesce. + These transformers can have buffers containing tables that were not written to the output. Flush is + the hook for them to return back locally stored tables and their statistics. The majority of transformers + should use default implementation. + If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray. 
+ :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be + propagated to metadata + """ + self.logger.info(f"Starting flush()") + if self.all_band_hashes is not None and self.all_minhashes is not None: + tables, metadata = self.write_band_signatures() + else: + tables = [] + metadata = {} + return tables, metadata + + def write_band_signatures(self): + # define the upper and lower bounds of each band segment + segment_bounds_list = [] + upper_bound = np.uint64(np.iinfo(np.uint64).max) + segment_len = np.uint64(upper_bound // self.num_segments) + for segment_index in range(self.num_segments): + segment_bounds_list.append(np.uint64(segment_index) * segment_len) + segment_bounds_list.append(upper_bound) + segment_bounds = np.array(segment_bounds_list, dtype=np.uint64) + self.logger.debug(f"Calculated {len(segment_bounds)} segment_bounds") + # output stats for the metadata + num_tables_written = 0 + num_docs_written = 0 + num_bytes_written = 0 + self.logger.debug(f"dataframe self.all_band_hashes has {len(self.all_band_hashes)} rows") + self.logger.debug(f"dataframe self.all_minhashes has {len(self.all_minhashes)} rows") + # iterate through the bands, get the band hashes for each band, divide + # them into segments, join with minhashes, and upload to storage + for band_ix in range(self.num_bands): + # Filtering on, then dropping the `band_index` column + band_df = self.all_band_hashes.filter(pl.col("band_index") == band_ix).drop("band_index") + # assign each band hash to a segment of the hashing space + self.logger.debug(f"band {band_ix} band_df has {len(band_df)} rows") + for segment_index in range(self.num_segments): + segment_band_df = band_df.filter( + (pl.col("band_hash") > segment_bounds[segment_index]) + & (pl.col("band_hash") <= segment_bounds[segment_index + 1]) + ) + self.logger.debug( + f"band {band_ix} segment {segment_index} segment_band_df has {len(segment_band_df)} rows" + ) + # join the band hash dataframe with the minihash and doc length dataframe + segment_band_minhash_df = segment_band_df.join( + self.all_minhashes, + on=self.document_id_column, + how="inner", + ) + self.logger.debug(f"band {band_ix} segment {segment_index} joined segment_band_df and minhashes") + + # encapsulate document info in a structure + segment_band_minhash_df = segment_band_minhash_df.select( + pl.col("band_hash"), + pl.struct( + [ + pl.col(self.document_id_column), + pl.col("minhashes"), + pl.col("document_length"), + ] + ).alias("document_data"), + ) + self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") + + # append the table to the result list, and the path to metadata + common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) + last_file_name_path = Path(self.last_file_name) + suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + if self.sc_data_access.output_folder is None: + self.sc_data_access.output_folder = self.data_access.output_folder + save_path = os.path.join( + self.sc_data_access.output_folder, + "bands", + f"band={band_ix}", + f"segment={segment_index}", + suffix_path, + ) + segment_band_minhash_table = segment_band_minhash_df.to_arrow() + bytes_written, _, _ = self.sc_data_access.save_table(save_path, segment_band_minhash_table) + if bytes_written > 0: + num_tables_written += 1 + num_docs_written += segment_band_minhash_table.num_rows + num_bytes_written += bytes_written + self.logger.debug(f"Uploaded table for band {band_ix} and 
segment {segment_index}") + # add the stats to metadata + metadata = { + "input_files": self.files_processed, + "input_docs": len(self.all_minhashes), + "input_bytes": self.bytes_processed, + "output_files": num_tables_written, + "output_docs": num_docs_written, + "output_bytes": num_bytes_written, + } + self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes") + self.files_processed = 0 + self.bytes_processed = 0 + self.all_minhashes = None + self.all_band_hashes = None + return [], metadata + + # define shingles generation function + def _generate_word_shingles( + self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " " + ) -> tuple[list, int, int]: + text = row[0] + # lower case + text = text.lower() + # replace numbers with '0' + text = NUMBERS_PATTERN.sub("0", text) + # convert punctuation to spaces + text = text.translate(PUNCTUATION_TRANS) + # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end + text = WHITESPACE_PATTERN.sub(" ", text.strip()) + # diacritics/unicode normalization + text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") + text = text.strip() + self.logger.debug(shingling_option) + if shingling_option == "char": + words = list(text) + else: + words = text.split() + document_id = row[1] + doc_len = len(row[0]) + word_count = len(words) + k_shingles = [] + for i in range(0, max(1, word_count - window_size + 1)): + k_shingles.append(delimiter.join(words[i : i + window_size])) + return k_shingles, doc_len, document_id + + def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + num_minhashes = len(minhashes) + assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" + results = [] + for band_index in range(b): + band_hash, _ = mmh3.hash64( + minhashes[band_index * r : (band_index + 1) * r], + seed=seed, + signed=False, + ) + results.append((band_hash, band_index, int_id_column)) + return results + + # Apply the function + def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + result = [] + for row in df.iter_rows(): + bands = self.emit_bands( + row[0], # document id + np.array(row[1], dtype=np.uint32), # minhashes + row[2], # document length + minhashlsh_num_bands, + minhashlsh_length_band, + ) + for band in bands: + result.append(band) + return result + + +class SignatureCalculationTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=SignatureCalculationTransform, + remove_from_metadata=[sigcalc_data_factory_key], + ) + self.daf = DataAccessFactory(cli_arg_prefix="scdata_") + + from data_processing.utils import get_logger + + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
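+        As an illustration (the values shown are this transform's documented defaults, given only
+        as an example), the parameters defined here surface on the command line with the "minhash_" prefix:
+            --minhash_num_permutations 112 --minhash_num_bands 14 --minhash_num_minhashes_per_band 8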
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{contents_column_cli_param}", + type=str, + default=contents_column_default, + help="name of the column storing the contents of each document", + ) + parser.add_argument( + f"--{seed_cli_param}", + type=int, + default=seed_default, + help="the seed used to instantiate the random number generator", + ) + parser.add_argument( + f"--{num_permutations_cli_param}", + type=int, + default=num_permutations_default, + help="number of permutations (minhashes) calculated for each document", + ) + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + parser.add_argument( + f"--{word_shingle_size_cli_param}", + type=int, + default=word_shingle_size_default, + help="the size of the word shingles calculated for each document", + ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="the number of bands to use in the banding technique", + ) + parser.add_argument( + f"--{num_minhashes_per_band_cli_param}", + type=int, + default=num_minhashes_per_band_default, + help="the number of minhashes to use in each band", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="the number of segments across which we divide the hashing space for each band", + ) + parser.add_argument( + f"--{shingle_option_cli_param}", + type=str, + default=shingle_option_default, + help="Shingling option", + ) + self.daf.add_input_params(parser=parser) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + self.params[sigcalc_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py new file mode 100644 index 000000000..5ddc102eb --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+import time
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.utils import get_logger
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformRuntimeConfiguration for the signature calculation transform,
+    as required by the PythonTransformLauncher.
+    This transform does not use a RayRuntime class, so the superclass only needs the base
+    python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration())
+    logger.info("Launching fdedup signature calculation transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet
new file mode 100644
index 000000000..d67b5bcf8
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet
new file mode 100644
index 000000000..267e78385
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json
new file mode 100644
index 000000000..de47f367b
--- /dev/null
+++ b/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json
@@ -0,0 +1,59 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdclean",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 10:34:04",
+        "end_time": "2024-10-18 10:34:04",
+        "status": "success"
+    },
+    "code": {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path"
+    },
+    "job_input_params": {
+        "document_id_column": "int_id_column",
+        "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "operation_mode": "filter_duplicates",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 96.1,
+        "gpus": 0,
+        "memory": 23.82,
+        "object_store": 0,
+        "execution time, min": 0.006
+    },
+    "job_output_stats": {
+        "source_files": 2,
+        "source_size": 4490,
+        "result_files": 2,
+        "result_size": 18001,
+        "processing_time": 0.341,
+        "input_files": 2,
+        "input_docs": 12,
+        "input_bytes": 8753,
+        "output_files": 2,
+        "output_docs": 4,
+        "output_bytes": 4650,
+        "filtered_docs": 8,
+        "filtered_bytes": 4103,
+        "source_doc_count": 12,
+        "result_doc_count": 4
+    },
+    "source": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input",
+        "type": "path"
+    },
+    "target": {
+        "name":
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 000000000..8e1fe121e Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + "target": { + "name": 
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..edbd80b43 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json new file mode 100644 index 000000000..ba1f5b0a6 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 13:22:42", + "end_time": "2024-10-18 13:22:42", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 32.5, + "gpus": 0, + "memory": 13.31, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.047, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..842ce2caa Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..e67164596 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..8a62a81b2 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:08:23", + "end_time": "2024-10-18 10:08:23", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.8, + "gpus": 0, + "memory": 24.15, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.006, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet new file mode 100644 index 000000000..c9220bf39 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet new file mode 100644 index 000000000..23fac4c72 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet 
differ diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py new file mode 100644 index 000000000..cecd224fe --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + basedir + "/expected/signature_calc/bands", + basedir + "/expected/cluster_analysis/docs_to_remove", + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py new file mode 100644 index 000000000..8c4debed9 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + } + launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py new file mode 100644 index 000000000..9ad8a32d7 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -0,0 +1,40 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")] + return fixtures diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 0b2e9cf1a..4bfe32a9e 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,5 +1,4 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} RUN pip install --upgrade --no-cache-dir pip @@ -14,24 +13,31 @@ COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md -COPY --chown=ray:users images/ images/ +COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
-# copy the main() entry point to the image -COPY ./src/fdedup_transform_ray.py . - -# copy some of the samples in -COPY src/fdedup_local_ray.py local/ +# copy source files needed by test-image +COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py +COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py +COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py +COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ COPY test-data/ test-data/ +USER root +RUN chmod a+rwx /home/ray +USER ray # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index f5f06c3c3..ec193b6c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -43,7 +43,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 923cbdf82..cb8c6306a 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -6,20 +6,16 @@ description = "fdedup Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "mmh3>=4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy>=1.12.0, <2.0.0" + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt new file mode 100644 index 000000000..6ee40ef7f --- /dev/null +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit[ray]==0.2.2.dev2 +dpk_fdedup_transform_python==0.2.2.dev2 +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py new file mode 100644 index 000000000..c54ba85c2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a0e8e7de2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py @@ -0,0 +1,74 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
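Signature calculation writes its band files into a band=<b>/segment=<s> folder hierarchy, which is what the local script above points at (.../signature_calc/bands) and what the get_folders() helper of ClusterAnalysisRayRuntime, defined just below, enumerates. A tiny sketch of that layout, using the band/segment counts from the python tests earlier in this diff:

# Illustrative: the band/segment folder layout consumed by cluster analysis.
# Mirrors ClusterAnalysisRayRuntime.get_folders() below; 14 bands x 2 segments
# are the counts used by the test fixtures.
import os

num_bands, num_segments = 14, 2
folders = [
    os.path.join(f"band={b}", f"segment={s}")
    for b in range(num_bands)
    for s in range(num_segments)
]
print(len(folders))  # 28 folders
print(folders[:3])   # ['band=0/segment=0', 'band=0/segment=1', 'band=1/segment=0']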
+################################################################################ + +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisRayRuntime(DefaultRayTransformRuntime): + """ + Cluster analysis runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/compute_shingles.py b/transforms/universal/fdedup/ray/src/compute_shingles.py deleted file mode 100644 index 2db75ebe2..000000000 --- a/transforms/universal/fdedup/ray/src/compute_shingles.py +++ /dev/null @@ -1,50 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import string - - -""" -This implements the most simplistic splitting of document based on the white spaces -that can be overwritten by a different document splitter (tokenizer). 
This method is -build in the library and can be overwritten using approach described at -https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library - -import compute_shingles -compute_shingles.compute_shingles = my_local_compute_shingles -""" - - -def _find(s: str, ch: str) -> list[int]: - """ - Get indexes of all locations of character in string - :param s: string - :param ch: character - :return: list of locations - """ - return [i for i, ltr in enumerate(s) if ltr == ch] - - -def compute_shingles(txt: str, word_shingle_size: int, delimiter: str = " ") -> list[str]: - """ - Generate word shingles - :param txt: document - :param delimiter: delimiter to split document - :param word_shingle_size: size of shingle in words - :return: list of shingles - """ - text = txt.replace("\n", "").lower().translate(str.maketrans("", "", string.punctuation)) - separators = _find(text, delimiter) - if len(separators) + 1 <= word_shingle_size: - return [text] - bounds = [-1] + separators + [len(text)] - return [text[bounds[i] + 1 : bounds[i + word_shingle_size]] for i in range(0, len(bounds) - word_shingle_size)] diff --git a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py similarity index 59% rename from transforms/universal/fdedup/ray/src/fdedup_local_ray.py rename to transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index af7bec71c..b951e2fc8 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -13,59 +13,57 @@ import os import sys +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) worker_options = {"num_cpus": 0.8} + code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { # where to run "run_locally": True, # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 1, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, } -sys.argv = ParamsUtils.dict_to_req(d=params) -# launch -launcher.launch() + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py new file mode 100644 index 000000000..88171e260 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -0,0 +1,138 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +import ray +from data_cleaning_transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, +) +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from ray.actor import ActorHandle + + +logger = get_logger(__name__) + + +class DataCleaningRayTransform(DataCleaningTransform): + """ """ + + def __init__(self, config: dict): + """ + Initialize based on the dictionary of configuration information. 
+ This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, LangSelectorTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + docs2removedf = config.get("df", None) + if docs2removedf is not None: + # This is recommended for production approach. In this case domain list is build by the + # runtime once, loaded to the object store and can be accessed by actors without additional reads + try: + config["df"] = ray.get(config.get("df")) + except Exception as e: + self.logger.warning(f"Exception loading docs2remove list from ray object storage {e}") + raise RuntimeError(f"exception loading from object storage for key {docs2removedf}") + super().__init__(config) + + +class DataCleaningRuntime(DefaultRayTransformRuntime): + """ + Ingest Data cleaning runtime support + """ + + def __init__(self, params: dict[str, Any]): + """ + Create filter runtime + :param params: parameters, that should include + ingest_supported_langs_file_key: supported languages file + ingest_detect_programming_lang_key: whether to detect programming language + ingest_domain_key: domain + ingest_snapshot_key: snapshot + """ + super().__init__(params) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__) + + def get_transform_config( + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], + ) -> dict[str, Any]: + """ + Set environment for filter execution + :param data_access_factory - data access factory + :param statistics - reference to the statistics object + :param files - list of files to remove + :return: dictionary of filter init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + docs_to_remove_list = ray.put(duplicate_list) + return {"df": docs_to_remove_list} | self.params + + +class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
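The relative-path handling in get_transform_config() above is easy to miss: a duplicate_list_location that does not start with "/" is resolved as a sibling of the output folder (the last path component of the output folder is dropped before joining), and an s3:// scheme prefix is stripped before the file is fetched. A standalone sketch of just that logic, with hypothetical bucket and folder names:

# Illustrative sketch of the duplicate-list path resolution in DataCleaningRuntime above.
# Only the joining logic mirrors the runtime code; the bucket/folder names are hypothetical.
def resolve_duplicate_list_location(output_folder: str, duplicate_list_location: str) -> str:
    if not duplicate_list_location.startswith("/"):
        out_paths = output_folder.rstrip("/").split("/")
        dupl_list_paths = duplicate_list_location.split("/")
        paths = out_paths[:-1] + dupl_list_paths
        duplicate_list_location = "/".join([p.strip("/") for p in paths])
    if duplicate_list_location.startswith("s3://"):
        _, duplicate_list_location = duplicate_list_location.split("://")
    return duplicate_list_location


print(
    resolve_duplicate_list_location(
        "s3://my-bucket/fdedup/cleaned",
        "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
    )
)
# -> my-bucket/fdedup/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet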
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(transform_class=DataCleaningRayTransform), + runtime_class=DataCleaningRuntime, + ) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py b/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py deleted file mode 100644 index 285fcfa22..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py +++ /dev/null @@ -1,76 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} - -s3_conf = { - "input_folder": "test/fdedup/input", - "output_folder": "test/fdedup/output", -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. 
Only required parameters are specified - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 5, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 2, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, -} -sys.argv = ParamsUtils.dict_to_req(d=params) - - -# launch -launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_support.py b/transforms/universal/fdedup/ray/src/fdedup_support.py deleted file mode 100644 index 60afb84bf..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_support.py +++ /dev/null @@ -1,621 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import pickle -import time -from typing import Any, Iterator, Union - -import numpy as np -import ray -from data_processing.data_access import SnapshotUtils -from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger -from data_processing_ray.runtime.ray import RayUtils -from ray.actor import ActorHandle -from ray.util import ActorPool -from scipy.integrate import quad as integrate - - -NO_SIMILARITY = -1 -REQUEST_LEN = 4096 -LONG_BUCKET = 5000 -LONG_BUCKET_PRINT = 1000 - - -def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, -) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - -class MurmurMH: - def __init__(self, num_perm: int, seed: int = RANDOM_SEED): - self.seed = seed - self.num_perm = num_perm - self.permutations = self._init_permutations(seed, num_perm) - - def minhash(self, shingle_count: int, shingles: Iterator[str]) -> np.array: - def generator(): - for shingle in shingles: - yield TransformUtils.str_to_int(shingle) - - hash_values = np.fromiter(generator(), dtype=np.uint64, count=shingle_count) - - result = np.zeros(self.permutations.shape, dtype=np.uint32) - for i, perm in enumerate(self.permutations): - result[i] = np.right_shift((perm * hash_values).T, 32).astype(np.uint32).min(axis=0, keepdims=False) - return result - - @staticmethod - def _init_permutations(seed: int, num_perm: int) -> np.array: - # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic - max_int = np.uint64((1 << 64) - 1) - gen = np.random.RandomState(seed) - # get self.num_perm pseudo random numbers between 2 and max_int (excl) - permutations = np.array([gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], dtype=np.uint64).T - # make all even pseudo random numbers odd by adding 1 - permutations[permutations % 2 == 0] += 1 - return permutations - - @staticmethod - def jaccard(mh1: np.array, mh2: np.array) -> float: - return np.count_nonzero(mh1 == mh2) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocCollector: - """ - An actor 
collecting de duped document IDs - """ - - def __init__(self, params: dict[str, Any]): - """ - Initializer - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - self.removed = set() - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.ids = {} - else: - try: - bids, _ = self.data_access.get_file(snapshot) - self.ids = pickle.loads(bids) - except Exception as e: - self.logger.warning(f"Failed to load doc collector {self.actor_id} with exception {e}") - raise e - - def add_documents(self, dr: tuple[list[tuple[int, int]], list[int]]) -> None: - """ - Add documents and removed document - :param dr: documents to keep and documents to remove - :return: - """ - docs = dr[0] - rm = dr[1] - # process documents to remove - for did in rm: - self.ids.pop(did, None) - self.removed.update(rm) - # process documents to keep - for key, val in docs: - if key in self.removed: - continue - if key in self.ids and val == NO_SIMILARITY: - # Do not update existing docs with NO_SIMILARITY - continue - else: - self.ids[key] = val - - def filter(self, docs: list[int]) -> dict[int, int]: - """ - Filter documents - :param docs: documents to filter - :return: documents to keep - """ - result = {} - for doc_id in docs: - r = self.ids.get(doc_id, None) - if r is not None: - result[doc_id] = r - return result - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.ids) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}docs/doc_collector_{self.actor_id}", b_doc - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float, int, float]: - """ - get sizes - :return: number of ids, its memory utilization, number of removed, its memory utilization - """ - return ( - len(self.ids), - TransformUtils.deep_get_size(self.ids) / GB, - len(self.removed), - TransformUtils.deep_get_size(self.removed) / GB, - ) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocsMinHash: - """ - An actor storing min hashes for a doc id - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialize - :param params: parameters - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.docs = {} - else: - try: - bdocs, _ = self.data_access.get_file(snapshot) - self.docs = pickle.loads(bdocs) - except Exception as e: - self.logger.warning(f"Failed to load minhash collector {self.actor_id} with exception {e}") - raise e - - def add_minhashes(self, updates: list[tuple[int, int, np.array]]) -> None: - """ - Add minhashes - :param updates: minhash for doc_id a tuple of doc len and array of hashes - :return: None - """ - for doc_id, length, minhash in updates: - self.docs[doc_id] = np.concatenate(([length], minhash)) - - def get_minhashes(self, doc_ids: list[int]) -> list[tuple[int, int, np.array]]: - """ - Get minhashes for a list of documents - :param doc_ids: list of doc ids - :return: doc id, len, minhashes - """ - result = [] - for doc_id in doc_ids: - info = self.docs.get(doc_id) - if info is not None: - result.append((doc_id, info[0], info[1:])) - return result - - def snapshot(self) -> None: - 
""" - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.docs) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}minhash/minhash_collector_{self.actor_id}", - b_doc, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot minhash collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get size of used min hashes - :return: number of docs, its memory utilization - """ - return len(self.docs), TransformUtils.deep_get_size(self.docs) / GB - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHash: - """ - Actor storing buckets information - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialization - """ - from ray.util.metrics import Counter - - self.submitter = None - self.n_buckets = 0 - self.bucket_memory = 0 - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.buckets = {} - else: - try: - b_buckets, _ = self.data_access.get_file(snapshot) - self.buckets = pickle.loads(b_buckets) - except Exception as e: - self.logger.warning(f"Failed to load buckets collector {self.actor_id} with exception {e}") - raise e - self.bucket_created_counter = Counter("bucket_created", "Amount of buckets created") - self.long_bucket_submit_counter = Counter("long_bucket_submitted", "Amount of long buckets submitted") - self.short_bucket_submit_counter = Counter("short_bucket_submitted", "Amount of short buckets submitted") - - def add_buckets(self, bck: list[tuple[int, list[int]]]) -> None: - """ - Add additional buckets to hash - :param bck: bucket information - :return: None - """ - for bucket in bck: - b_hash = bucket[0] - buckets_for_hash = self.buckets.get(b_hash) - if buckets_for_hash: - if type(buckets_for_hash) == int: - self.buckets[b_hash] = [buckets_for_hash] + bucket[1] - else: - buckets_for_hash.extend(bucket[1]) - else: - if len(bucket[1]) == 1: - self.buckets[b_hash] = bucket[1][0] - else: - self.buckets[b_hash] = bucket[1] - self.bucket_created_counter.inc(1) - - def add_processing_submitter(self, submitter: ActorHandle) -> None: - """ - Add process submitter - :param submitter: reference to submitter - :return: - """ - self.submitter = submitter - - def process_buckets(self) -> None: - """ - Process buckets to generate documents - :return: None - """ - - # Remember usage - self.n_buckets = len(self.buckets) - self.bucket_memory = TransformUtils.deep_get_size(self.buckets) / GB - - # split buckets into short and long. 
Long buckets can take very long to process - long_buckets = [] - short_buckets = [] - while len(self.buckets) > 0: - doc_id, bucket = self.buckets.popitem() - if type(bucket) == list and len(bucket) > LONG_BUCKET: - # Its long - long_buckets.append(bucket) - else: - short_buckets.append(bucket) - self.logger.info(f"processing buckets {len(long_buckets)} long, {len(short_buckets)} short") - - # process long buckets first - we are submitting them one at a time - for bucket in long_buckets: - if len(bucket) > 2 * LONG_BUCKET: - # For very long buckets, split them - self.logger.info(f"Splitting bucket of length len(bucket) into chunks") - smaller_bucket = [ - bucket[i * LONG_BUCKET : (i + 1) * LONG_BUCKET] - for i in range((len(bucket) + LONG_BUCKET - 1) // LONG_BUCKET) - ] - for b in smaller_bucket: - ray.get(self.submitter.submit_for_processing.remote([b])) - self.long_bucket_submit_counter.inc(1) - else: - ray.get(self.submitter.submit_for_processing.remote([bucket])) - self.long_bucket_submit_counter.inc(1) - self.logger.info("Done submitting long buckets") - - # And now the rest of buckets - bucket_chunks = [short_buckets[i * 100 : (i + 1) * 100] for i in range((len(short_buckets) + 99) // 100)] - for b in bucket_chunks: - ray.get(self.submitter.submit_for_processing.remote(b)) - self.short_bucket_submit_counter.inc(len(b)) - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_buckets = pickle.dumps(self.buckets) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}buckets/buckets_collector_{self.actor_id}", - b_buckets, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot buckets collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get buckets resource utilization - :return: number of buckets and memory utilization - """ - return self.n_buckets, self.bucket_memory - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHashProcessor: - """ - Actor for processing buckets - """ - - def __init__(self, params: dict[str, Any]): - """ - Init method - :param params - dictionary of parameters containing the following keys - remote_docs - handles to the remote docs - remote_minhashes - handles to the remote minhashes - mn_min_hash - MurmurMH class - threshold - threshold - statistics - statistics actor - """ - from ray.util.metrics import Counter - - self.threshold = params["threshold"] - self.mn_min_hash = params["mn_min_hash"] - self.remote_docs = params["remote_docs"] - self.remote_minhashes = params["remote_minhashes"] - self.stats = params["statistics"] - self.logger = get_logger(__name__) - self.bucket_processed_counter = Counter("bucket_processed", "Amount of buckets processed") - - def _submit_generated_docs(self, docs: dict[int, int], removed: set[int]) -> None: - """ - Submit generated documents - :param docs: docs to submit - :param removed: removed documents - :return: None - """ - # Remove doc ids that are already removed - for did in removed: - docs.pop(did, None) - # Build remote requests - request = [([], []) for _ in range(len(self.remote_docs))] - for key, value in docs.items(): - req_tuple = request[key % len(self.remote_docs)] - req_tuple[0].append((key, value)) - for did in removed: - req_tuple = request[did % len(self.remote_docs)] - req_tuple[1].append(did) - # Submit requests and wait for replies - remote_replies = [] - i = 0 - for req in request: - if len(req[0]) > 0 or len(req[1]) > 0: # Only submit if the request has data - 
remote_replies.append(self.remote_docs[i].add_documents.remote(req)) - i += 1 - # Process replies - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - # get minhashes and length for docs in the bucket - def _get_minhashes_docs(self, doc_ids: list[int]) -> dict[int, tuple[int, list[int]]]: - """ - Get minhashes for documents by submitting requests to an appropriate doc collectors - :param doc_ids: doc ids - :return: doc ids with hashes - """ - request = [[] for _ in range(len(self.remote_minhashes))] - for value in doc_ids: - request[value % len(self.remote_minhashes)].append(value) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.remote_minhashes[i].get_minhashes.remote(req)) - i += 1 - # Process replies - hashes = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - for r in reply: - hashes[r[0]] = (r[1], r[2]) - remote_replies = not_ready - return hashes - - def process_buckets(self, buckets: list[Union[int, list[int]]]) -> None: - """ - process buckets to generate documents - :param buckets: buckets - :return: none - """ - t_start = time.time() - docs = {} - removed = set() - for bucket in buckets: - if type(bucket) == int: - # This hash has a single document - if bucket not in docs: - docs[bucket] = NO_SIMILARITY - self.bucket_processed_counter.inc(1) - continue - # multiple documents - start = time.time() - bucket_len = len(bucket) - very_long = bucket_len > LONG_BUCKET - - hashes = self._get_minhashes_docs(bucket) - set_list = [] - unvisited = set(bucket) - - # combine similar documents - index = 0 - while len(unvisited) > 0: - current_doc_id = unvisited.pop() - current_mh = hashes[current_doc_id][1] - current_set = set() - for other_doc_id in bucket: - if other_doc_id in unvisited: - other_mh = hashes[other_doc_id][1] - if self.mn_min_hash.jaccard(current_mh, other_mh) >= self.threshold: - current_set.add(current_doc_id) - current_set.add(other_doc_id) - unvisited.discard(other_doc_id) - if len(current_set) > 0: - set_list.append(current_set) - index += 1 - if index % LONG_BUCKET_PRINT == 0: - self.logger.info(f"processing very long {bucket_len} bucket, {index} documents so far") - if index > LONG_BUCKET_PRINT: - self.logger.info(f"done processing very long {bucket_len}") - - # process created sets - for current_set in set_list: - for d in current_set: - bucket.remove(d) - removed.update(current_set) - for i, doc_id in enumerate(current_set): - if i == 0: - cluster_id = doc_id - remaining = doc_id - min_len = hashes[doc_id][0] - max_len = min_len - continue - c_len = hashes[doc_id][0] - if c_len > max_len: - max_len = c_len - remaining = doc_id - continue - if c_len <= min_len: - min_len = c_len - cluster_id = doc_id - docs[remaining] = cluster_id - removed.discard(remaining) - - # if we did not find docs in connections, submit them as NO_SIMILARITY - for d in bucket: - if d not in docs: - docs[d] = NO_SIMILARITY - if very_long: - self.logger.info( - f"Processed long ({bucket_len}) bucket in {round((time.time() - start) / 60.,3)} " - f"min; " - f"docs chains {len(set_list)}" - ) - self.bucket_processed_counter.inc(1) - # Submit docs - self._submit_generated_docs(docs, removed) - # peg stats - self.stats.add_stats.remote({"generated doc_ids": len(docs), "bucket processing time": time.time() - t_start}) - - -@ray.remote(scheduling_strategy="SPREAD") -class 
BucketsHashProcessorInvoker(object): - """ - Bucket hash processing coordinator (singleton) - """ - - def __init__(self, bucket_processors: list[ActorHandle]) -> None: - self.n_processors = len(bucket_processors) - self.pool = ActorPool(bucket_processors) - self.submitted = 0 - self.processed = 0 - self.logger = get_logger(__name__) - self.start = time.time() - - def submit_for_processing(self, buckets: list[Union[int, list[int]]]) -> None: - # Get completed results - if self.submitted < self.n_processors: # still have room - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.logger.debug("Submitted bucket processing request") - self.submitted += 1 - return - else: - while True: - # we can have several workers fail here - try: - self.pool.get_next_unordered() - break - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") - self.logger.debug("Completed bucket processing request") - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.submitted += 1 - self.logger.debug("Submitted bucket processing request") - return - - def wait_for_completion(self) -> None: - self.logger.info(f"Waiting bucket processing completion. Submitted requests {self.submitted}") - while self.pool.has_next(): - try: - self.pool.get_next_unordered() - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py index 6c6c02bb3..be1bf5fcb 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py @@ -10,794 +10,67 @@ # limitations under the License. 
################################################################################ -import random -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import mmh3 -import numpy as np -import pyarrow as pa -import ray -from data_processing.data_access import DataAccessFactoryBase, SnapshotUtils -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import ( - RANDOM_SEED, - CLIArgumentProvider, - TransformUtils, - str2bool, -) -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformFileProcessor, - RayTransformLauncher, - RayUtils, +import argparse +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from fdedup_transform_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, ) -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, +from get_duplicate_list_transform_ray import ( + GetDuplicateListRayRuntime, + GetDuplicateListRayTransformConfiguration, ) -from fdedup_support import ( - REQUEST_LEN, - BucketsHash, - BucketsHashProcessor, - BucketsHashProcessorInvoker, - DocCollector, - DocsMinHash, - MurmurMH, - fuzzy_optimal_param, -) -from ray.actor import ActorHandle -from ray.util import ActorPool - - -short_name = "fdedup" -cli_prefix = f"{short_name}_" - - -class FdedupTransform(AbstractTableTransform): - """ - Implements fuzzy dedup data preprocessor (building tables and minhashes). - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. 
- :param config: initialization parameters, with the following keys - doc_column - name of doc column - doc_id_int_column - name of int doc id column - word_shingle_size - word shingle size - mn_min_hash - MurmurMH class - num_bands - number of bands - length_band band length - remote_buckets - bucket actors - remote_minhashes - minhash actors - delimiter - delimiter - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.word_shingle_size = config.get("word_shingle_size", 1) - self.delimiter = config.get("delimiter", " ") - self.mn_min_hash = config.get("mn_min_hash", None) - self.num_bands = config.get("num_bands", 1) - self.length_band = config.get("length_band", 1) - self.buckets = config.get("remote_buckets", []) - self.minhashes = config.get("remote_minhashes", []) - self.random_delay_limit = config.get("random_delay_limit", 10) - - def _generate_minhashes(self, shingles: list[str]) -> np.array: - """ - Generate minhashes - :param shingles: - :return: generated minhashes - """ - min_hashes = self.mn_min_hash.minhash(len(shingles), shingles) - num_min_hashes = len(min_hashes) - assert self.num_bands * self.length_band <= num_min_hashes, ( - f"num_bans*band_len must be <= num min hashes, was num_bands={self.num_bands}, " - f"bands_len={self.length_band}, num_min hashes={num_min_hashes}" - ) - return min_hashes - - def _generate_buckets(self, min_hashes: np.array) -> list[int]: - """ - Generate buckets - :param min_hashes: array of minhashes - :return: - """ - return [ - mmh3.hash64(min_hashes[i * self.length_band : (i + 1) * self.length_band], seed=RANDOM_SEED, signed=False)[ - 0 - ] - for i in range(self.num_bands) - ] - - def _submit_buckets_minhashes( - self, buckets: dict[int, list[int]], minhashes: list[tuple[int, int, np.array]] - ) -> None: - """ - Submit buckets to hash - :param buckets: buckets - :param minhashes: minhashes - :return: None - """ - # bucket requests - request = [[] for _ in range(len(self.buckets))] - for key, value in buckets.items(): - request[key % len(self.buckets)].append((key, value)) - # Submit requests to appropriate bucket collectors - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.buckets[i].add_buckets.remote(req)) - i += 1 - # Minhashes - request = [[] for _ in range(len(self.minhashes))] - for minh in minhashes: - request[minh[0] % len(self.minhashes)].append(minh) - # Submit requests to appropriate minhash collectors - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.minhashes[i].add_minhashes.remote(req)) - i += 1 - # wait for completion - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - Preprocessing table content. 
- :param table: table - :param file_name - name of currently processed file - :return: resulting table, statistics - """ - from compute_shingles import compute_shingles - - def flush(limit: int) -> None: - """ - flushing buckets and minhashes to dedicated actors - :param limit: number of buckets to flush - :return: None - """ - if len(buckets) >= limit: # time to submit - nonlocal num_buckets - nonlocal num_minhashes - self._submit_buckets_minhashes(buckets, minhashes) - num_buckets = num_buckets + len(buckets) - num_minhashes = num_minhashes + len(minhashes) - buckets.clear() - minhashes.clear() - - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # Inner variables - buckets = {} - minhashes = [] - num_buckets = 0 - num_minhashes = 0 - docs = table[self.doc_column] - doc_ids = table[self.doc_id_column] - # for every document/its integer id - for n in range(table.num_rows): - doc = docs[n].as_py() - doc_id = doc_ids[n].as_py() - shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter) - if len(shingles) > 0: - mh = self._generate_minhashes(shingles) - minhashes.append((doc_id, len(doc), mh)) - candidates = self._generate_buckets(mh) - - for b_hash in candidates: - bucket_array = buckets.get(b_hash) - if bucket_array is None: - buckets[b_hash] = [doc_id] - else: - bucket_array.append(doc_id) - flush(REQUEST_LEN) - flush(0) - # peg stats - stats = {"generated buckets": num_buckets, "generated minhashes": num_minhashes} - time.sleep(int(random.random() * self.random_delay_limit)) - return [], stats - - -class FdedupFilter(AbstractTableTransform): - """ - Filtering documents - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - The dictionary should contain the following: - doc_column - name of doc column - doc_id_int_column - name of int doc id column - cluster_column - name of the cluster column - remote_docs - list of remote doc collectors - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.cluster_column = config.get("cluster_column", "") - self.docs = config.get("remote_docs", "") - self.random_delay_limit = config.get("random_delay_limit", 10) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - De duping (filtering) table content. 
- :param table: table - :param file_name: name of the currently processing file - :return: resulting table, statistics - """ - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # inner variables - ids = table.column(self.doc_id_column) - # Submit requests to an appropriate doc collectors - request = [[] for _ in range(len(self.docs))] - for value in ids: - doc_id = value.as_py() - request[doc_id % len(self.docs)].append(doc_id) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.docs[i].filter.remote(req)) - i += 1 - # Process replies - unique = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - unique.update(reply) - remote_replies = not_ready - # Filter out table - mask = [] - clusters = [] - # Actual filtering - for n in range(table.num_rows): - doc_id = ids[n].as_py() - if doc_id in unique: - mask.append(True) - clusters.append(unique.pop(doc_id)) - else: - mask.append(False) - # build out table - out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters) - # build execution statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} - time.sleep(int(random.random() * self.random_delay_limit)) - return [out_table], stats - - -class FdedupRuntime(DefaultRayTransformRuntime): - """ - Fuzzy dedup runtime support. Here we are using set environment to implement first two steps of fuzzy dedup - processing - preprocessing and bucket hash processing - """ - - def __init__(self, params: dict[str, Any]): - """ - Create filter runtime - :param params: parameters, that should include - doc_column - name of the document column - id_column - name of the integer doc id column - cluster_column - name of the cluster column - worker_options - start options for preprocessor - from the orchestrator configuration - bucket_cpu - number of cpus for bucket actor - doc_cpu - number of cpus for doc actor - mhash_cpu - number of cpus for minhash actor - num_doc_actors - number of document actors - num_bucket_actors - number of bucket actors - num_minhash_actors - number of minhash actors - num_preprocessors - number of preprocessors - snapshot_delay - delay (sec) in sending snapshot requests to actors - use_bucket_snapshot - use bucket snapshot - use_doc_snapshot - use doc snapshot - random_delay_limit - random_delay limit - # fuzzy specific parameters - num_permutations - number of permutations - threshold - threshold - world_shingle_size - word shingles size - delimiters - delimiter - """ - from data_processing.utils import get_logger - - super().__init__(params) - self.logger = get_logger(__name__) - self.sum_buckets = 0 - self.sum_buckets_mem = 0 - self.sum_mh = 0 - self.sum_mh_mem = 0 - self.document_collectors = [] - self.snapshot_delay = self.params.get("snapshot_delay", 1) - self.random_delay_limit = self.params.get("random_delay_limit", 10) - - def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> dict[str, Any]: - """ - Set environment for filter execution - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: dictionary of filter init params - """ - if self.params.get("use_doc_snapshot", 
False): - self.logger.info("continuing from the document actors snapshot") - data_access = data_access_factory.create_data_access() - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}docs" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.info(f"Found the following snapshot files {files.keys()}") - self.document_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - self.document_collectors[i] = DocCollector.options( - **{"num_cpus": self.params.get("doc_cpu", 0.5)} - ).remote({"id": i, "data_access": data_access_factory, "snapshot": file}) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(self.document_collectors)} document collectors to continue processing") - else: - self.logger.info("starting run from the beginning") - self._create_doc_actors(data_access_factory=data_access_factory, statistics=statistics, files=files) - return { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "cluster_column": self.params.get("cluster_column", ""), - "remote_docs": self.document_collectors, - "random_delay_limit": self.random_delay_limit, - } - - def _create_doc_actors( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: None - """ - mn_min_hash = MurmurMH(num_perm=self.params.get("num_permutations", 64), seed=RANDOM_SEED) - if self.params.get("use_bucket_snapshot", False): - self.logger.info("continuing from the bucket actors snapshot") - data_access = data_access_factory.create_data_access() - # recreate bucket collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}buckets" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following bucket snapshot files {files.keys()}") - bucket_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(bucket_collectors)} bucket collectors to continue processing") - # recreate minhash collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}minhash" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following minhash snapshot files {files.keys()}") - minhash_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - self.logger.info(f"Created {len(minhash_collectors)} minhash collectors to 
continue processing") - else: - self.logger.info("continuing from the very beginning") - self._create_doc_actors_internal( - data_access_factory=data_access_factory, statistics=statistics, mn_min_hash=mn_min_hash, files=files - ) - - def _create_doc_actors_internal( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - mn_min_hash: MurmurMH, - files: list[str], - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param mn_min_hash - MurmurMH class - :param files - list of files to process - :return: None - """ - # compute fuzzy dedup parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=self.params.get("threshold", 0.8), - num_perm=self.params.get("num_permutations", 64), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - self.logger.info(f"Fuzzy: num buckets {num_buckets}, bucket length {length_bucket}") - # Build bucket and minhash collectors - bucket_collectors = [None] * self.params.get("num_bucket_actors", 1) - for i in range(self.params.get("num_bucket_actors", 1)): - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(bucket_collectors)} bucket actors") - minhash_collectors = [None] * self.params.get("num_minhash_actors", 1) - for i in range(self.params.get("num_minhash_actors", 1)): - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(minhash_collectors)} minhash actors") - self._preprocess_tables( - data_access_factory=data_access_factory, - statistics=statistics, - files=files, - mn_min_hash=mn_min_hash, - num_buckets=num_buckets, - length_bucket=length_bucket, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - random_delay_limit=self.random_delay_limit, - ) - # At this point we can snapshot both bucket and minhash collectors for potential restart - self.logger.info("creating minhash snapshots") - minhash_replies = [None] * len(minhash_collectors) - index = 0 - for collector in minhash_collectors: - minhash_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while minhash_replies: - ready, not_ready = ray.wait(minhash_replies) - minhash_replies = not_ready - self.logger.info("minhash snapshots created") - self.logger.info("creating bucket snapshots") - bucket_replies = [None] * len(bucket_collectors) - index = 0 - for collector in bucket_collectors: - bucket_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while bucket_replies: - ready, not_ready = ray.wait(bucket_replies) - bucket_replies = not_ready - self.logger.info("bucket snapshots created") - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - - def _process_buckets( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - mn_min_hash: MurmurMH, - ) -> None: - """ - Process buckets - :param data_access_factory - data access factory - :param statistics - statistics actor - :param bucket_collectors - bucket collectors - 
:param minhash_collectors - minhash collectors - :param mn_min_hash - MMurmurMH class - :return: None - """ - # Create document collectors - self.document_collectors = [None] * self.params.get("num_doc_actors", 1) - for i in range(self.params.get("num_doc_actors", 1)): - self.document_collectors[i] = DocCollector.options(**{"num_cpus": self.params.get("doc_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(self.document_collectors)} document actors") - # create bucket processors - bucket_processors_list = RayUtils.create_actors( - clazz=BucketsHashProcessor, - params={ - "remote_docs": self.document_collectors, - "remote_minhashes": minhash_collectors, - "mn_min_hash": mn_min_hash, - "threshold": self.params.get("threshold", 0.8) * self.params.get("num_permutations", 64), - "statistics": statistics, - }, - actor_options=self.params.get("worker_options", None), - n_actors=self.params.get("num_preprocessors", 1), - ) - self.logger.info(f"created {len(bucket_processors_list)} bucket processor actors") - # create bucket processors invoker - bucket_processor_invoker = BucketsHashProcessorInvoker.options( - num_cpus=self.params.get("bucket_cpu", 0.5) - ).remote(bucket_processors=bucket_processors_list) - self.logger.info(f"created bucket processor invoker") - # Add invoker to the buckets - bucket_replies = [ - collector.add_processing_submitter.remote(submitter=bucket_processor_invoker) - for collector in bucket_collectors - ] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - self.logger.info(f"added invoker to bucket collectors") - # start bucket processing and wait for completion - start = time.time() - bucket_replies = [collector.process_buckets.remote() for collector in bucket_collectors] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - # Wait for pool to complete - ray.get(bucket_processor_invoker.wait_for_completion.remote()) - self.logger.info(f"Done processing buckets in {round((time.time() - start) / 60.,3)} min") - # At this point we can save doc actors, in case we would want to restart here - self.logger.info(f"creating document snapshots") - doc_replies = [None] * len(self.document_collectors) - index = 0 - for collector in self.document_collectors: - doc_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while doc_replies: - ready, not_ready = ray.wait(doc_replies) - doc_replies = not_ready - self.logger.info(f"document snapshots created") - # At this point we do not need bucket and minhash actors, remove them - # but first get usage information - # Bucket collector - replies = [collector.get_size.remote() for collector in bucket_collectors] - while replies: - ready, not_ready = ray.wait(replies) - b_amount, b_memory = ray.get(ready)[0] - self.sum_buckets += b_amount - self.sum_buckets_mem += b_memory - replies = not_ready - for collector in bucket_collectors: - ray.kill(actor=collector, no_restart=True) - # minhash collector - replies = [collector.get_size.remote() for collector in minhash_collectors] - while replies: - ready, not_ready = ray.wait(replies) - m_amount, m_memory = ray.get(ready)[0] - self.sum_mh += m_amount - self.sum_mh_mem += m_memory - replies = not_ready - for collector in minhash_collectors: - ray.kill(actor=collector, no_restart=True) - # Clean up processors - for processor in bucket_processors_list: - ray.kill(actor=processor, no_restart=True) - ray.kill(bucket_processor_invoker) - - def 
_preprocess_tables( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - files: list[str], - mn_min_hash: MurmurMH, - num_buckets: int, - length_bucket: int, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - random_delay_limit: int, - ) -> None: - """ - Preprocess tables - build, run and cleanup - :param data_access_factory - data access factory - :param statistics - statistics actor - :param files - list of files to process - :param mn_min_hash - MurmurMH class - :param num_buckets - number of buckets - :param length_bucket - bucket length - :param bucket_collectors - bucket collector actors - :param minhash_collectors - minhash_collector actors - :param random_delay_limit - max for random dalay limit - :return: None - """ - from ray.util.metrics import Gauge - - worker_options = self.params.get("worker_options", None) - # Here we are limiting the number of readers not to overwhelm COS - n_readers = self.params.get("num_preprocessors", 1) - if n_readers > 1000: - n_readers = 1000 - self.logger.info(f"Table preprocessing uses {n_readers} readers") - # Create preprocessing actors - processor_params = { - "data_access_factory": data_access_factory, - "transform_class": FdedupTransform, - "statistics": statistics, - "transform_params": { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "word_shingle_size": self.params.get("world_shingle_size", 1), - "mn_min_hash": mn_min_hash, - "num_bands": num_buckets, - "length_band": length_bucket, - "remote_buckets": bucket_collectors, - "remote_minhashes": minhash_collectors, - "delimiter": self.params.get("delimiter", " "), - "random_delay_limit": random_delay_limit, - }, - "base_table_stats": False, - } - processors_list = RayUtils.create_actors( - clazz=RayTransformFileProcessor, - params=processor_params, - actor_options=worker_options, - n_actors=n_readers, - ) - self.logger.info(f"created {len(processors_list)} table processor actors") - # Execute preprocessing - # create gauges - files_in_progress_gauge = Gauge( - "preprocessing_files_in_progress", "Number of files in progress, preprocessing" - ) - files_completed_gauge = Gauge( - "preprocessing_files_processed_total", "Number of files completed, preprocessing" - ) - available_cpus_gauge = Gauge("preprocessing_available_cpus", "Number of available CPUs, preprocessing") - available_gpus_gauge = Gauge("preprocessing_available_gpus", "Number of available GPUs, preprocessing") - available_memory_gauge = Gauge("preprocessing_available_memory", "Available memory, preprocessing") - available_object_memory_gauge = Gauge( - "preprocessing_available_object_store", "Available object store, preprocessing" - ) - print_interval = int(len(files) / 100) - if print_interval == 0: - print_interval = 1 - # process data - processors = ActorPool(processors_list) - failures = RayUtils.process_files( - executors=processors, - files=files, - print_interval=print_interval, - files_in_progress_gauge=files_in_progress_gauge, - files_completed_gauge=files_completed_gauge, - available_cpus_gauge=available_cpus_gauge, - available_gpus_gauge=available_gpus_gauge, - available_memory_gauge=available_memory_gauge, - object_memory_gauge=available_object_memory_gauge, - logger=self.logger, - ) - if failures > 0: - statistics.add_stats.remote({"actor failures": failures}) - # Clean up processors - for processor in processors_list: - ray.kill(actor=processor, no_restart=True) - del processors - - def 
compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: - """ - Compute execution statistics - :param stats: output of statistics - :return: job execution statistics - """ - # Get document collector statistics - sum_docs = 0 - sum_docs_mem = 0 - sum_removed = 0 - sum_removed_mem = 0 - replies = [collector.get_size.remote() for collector in self.document_collectors] - while replies: - ready, not_ready = ray.wait(replies) - d_amount, d_memory, r_amount, r_memory = ray.get(ready)[0] - sum_docs += d_amount - sum_docs_mem += d_memory - sum_removed += r_amount - sum_removed_mem += r_memory - replies = not_ready - overall_hash_memory = self.sum_buckets_mem + self.sum_mh_mem + sum_docs_mem + sum_docs_mem + sum_removed_mem - dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1)) - return { - "number of buckets": self.sum_buckets, - "number of docs": sum_docs, - "number of removed docs": sum_removed, - "number of min hashes": self.sum_mh, - "overall hash memory GB": overall_hash_memory, - "de duplication %": dedup_prst, - } | stats - +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration -class FdedupTableTransformConfiguration(TransformConfiguration): - """ - Provides support for configuring and using the associated Transform class include - configuration with CLI args and combining of metadata. - """ - def __init__(self): - super().__init__( - name=short_name, - transform_class=FdedupFilter, - ) - from data_processing.utils import get_logger +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} - self.logger = get_logger(__name__) - def add_input_params(self, parser: ArgumentParser) -> None: - """ - Add Transform-specific arguments to the given parser. 
- """ - parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="document column name") - parser.add_argument( - f"--{cli_prefix}id_column", type=str, default="int_document_id", help="integer document id column name" - ) - parser.add_argument(f"--{cli_prefix}cluster_column", type=str, default="cluster", help="cluster column name") - parser.add_argument( - f"--{cli_prefix}bucket_cpu", type=float, default=0.5, help="number of CPUs per bucket hash" - ) - parser.add_argument( - f"--{cli_prefix}mhash_cpu", type=float, default=0.5, help="number of CPUs per minhash hash" - ) - parser.add_argument(f"--{cli_prefix}doc_cpu", type=float, default=0.5, help="number of CPUs per doc hash") - parser.add_argument(f"--{cli_prefix}num_doc_actors", type=int, default=1, help="number of doc actors to use") - parser.add_argument( - f"--{cli_prefix}num_minhash_actors", type=int, default=1, help="number of minhash actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_bucket_actors", type=int, default=1, help="number of bucket actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_preprocessors", type=int, default=1, help="number of preprocessors to use" - ) - parser.add_argument(f"--{cli_prefix}num_permutations", type=int, default=64, help="number of permutations") - parser.add_argument(f"--{cli_prefix}threshold", type=float, default=0.8, help="threshold") - parser.add_argument(f"--{cli_prefix}shingles_size", type=int, default=5, help="number of words in shingle") - parser.add_argument( - f"--{cli_prefix}delimiters", type=str, default=" ", help="delimiter for splitting document" - ) - parser.add_argument(f"--{cli_prefix}snapshot_delay", type=int, default=1, help="snapshot delay time") - parser.add_argument( - f"--{cli_prefix}use_bucket_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with bucket snapshot", - ) - parser.add_argument( - f"--{cli_prefix}use_doc_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with doc snapshot", - ) - parser.add_argument( - f"--{cli_prefix}random_delay_limit", type=int, default=10, help="maximum delay between read" - ) +ray_worker_options = {"num_cpus": 0.8} +ray_params = { + # where to run + "run_locally": True, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(ray_worker_options), + "runtime_num_workers": 3, +} - def apply_input_params(self, args: Namespace) -> bool: - """ - Validate and apply the arguments that have been parsed - :param args: user defined arguments. - :return: True, if validate pass or False otherwise - """ - captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) - self.params = self.params | captured - self.params["worker_options"] = args.runtime_worker_options - if self.params["use_bucket_snapshot"] and self.params["use_doc_snapshot"]: - self.logger.warning("both bucket and doc snapshot are specified. 
Only one allowed") - return False +ray_params_argv = ParamsUtils.dict_to_req(ray_params) - self.logger.info(f"fuzzy dedup params are {self.params}") - return True +class RayServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) -class FdedupRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=FdedupTableTransformConfiguration(), runtime_class=FdedupRuntime) + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params if service_short_name == "fdlist" else ray_params_argv + params[1:] + if service_short_name == "minhash": + launcher = RayTransformLauncher(runtime_config=SignatureCalculationRayTransformConfiguration()) + elif service_short_name == "cluster": + launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = RayTransformLauncher(runtime_config=GetDuplicateListRayTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + status = launcher.launch() + return status if __name__ == "__main__": - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - launcher.launch() + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = RayServiceOrchestrator(global_params=args) + # Launch ray fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..40081e658 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py @@ -0,0 +1,69 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListRayRuntime(DefaultRayTransformRuntime): + """ + Get duplicate list runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Get Duplicate List + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py new file mode 100644 index 000000000..cb87b56af --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # execution info
+    "minhash_num_permutations": 112,
+    "minhash_num_bands": 14,
+    "minhash_num_segments": 2,
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
new file mode 100644
index 000000000..678d953f2
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.utils import CLIArgumentProvider, get_logger
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformConfiguration for the fuzzy dedup signature calculation
+    transform, as required by the RayTransformLauncher. Signature calculation does not use
+    a dedicated RayRuntime class, so the superclass only needs the base python-only configuration. 
+    """
+
+    def __init__(self):
+        """
+        Initialization: set up the launcher with the signature calculation
+        transform configuration; no additional runtime class is required.
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    # create the launcher
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    logger.info("Launching fuzzy dedup signature calculation ray transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
new file mode 100644
index 000000000..79fe53b62
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
new file mode 100644
index 000000000..9df2f3bd5
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
new file mode 100644
index 000000000..f5da05a10
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
new file mode 100644
index 000000000..0e089dee3
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
new file mode 100644
index 000000000..4b0fecb15
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
new file mode 100644
index 000000000..57642d199
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
new file mode 100644
index 000000000..57642d199
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 000000000..8e1fe121e Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet new file mode 100644 index 000000000..03a0c321a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + 
"path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..8aa870c00 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": 
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/metadata.json index 4a1b54395..a0b26f931 100644 --- a/transforms/universal/fdedup/ray/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/ray/test-data/expected/metadata.json @@ -2,86 +2,48 @@ "pipeline": "pipeline_id", "job details": { "job category": "preprocessing", - "job name": "fdedup", - "job type": "ray", + "job name": "fdlist", + "job type": "pure python", "job id": "job_id", - "start_time": "2024-06-24 19:39:44", - "end_time": "2024-06-24 19:39:57", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", "status": "success" }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, + "code": null, "job_input_params": { - "doc_column": "contents", - "id_column": "int_id_column", - "cluster_column": "cluster", - "bucket_cpu": 0.5, - "mhash_cpu": 0.5, - "doc_cpu": 0.5, - "num_doc_actors": 1, - "num_minhash_actors": 1, - "num_bucket_actors": 1, - "num_preprocessors": 2, - "num_permutations": 64, - "threshold": 0.8, - "shingles_size": 5, - "delimiters": " ", - "snapshot_delay": 1, - "use_bucket_snapshot": false, - "use_doc_snapshot": false, - "random_delay_limit": 5, - "worker_options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, "checkpointing": false, "max_files": -1, "random_samples": -1, "files_to_use": [".parquet"], - "number of workers": 1, - "worker options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, - "actor creation delay": 0 + "num_processors": 0 }, "execution_stats": { - "cpus": 16, + "cpus": 4.5, "gpus": 0, - "memory": 14.396823502145708, - "object_store": 2.0, - "execution time, min": 0.22008283535639445 + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 }, "job_output_stats": { - "number of buckets": 15, - "number of docs": 3, - "number of removed docs": 2, - "number of min hashes": 5, - "overall hash memory GB": 7.152557373046875e-6, - "de duplication %": 40.0, - "source_files": 2, - "source_size": 73126, - "generated buckets": 15, - "generated minhashes": 5, - "source_doc_count": 10, - "generated doc_ids": 3, - "bucket processing time": 0.04204988479614258, "result_files": 1, - "result_size": 36941, - "processing_time": 2.286285161972046, - "source_documents": 5, - "result_documents": 3, - "result_doc_count": 3 + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 }, "source": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/test-data/input", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/output", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet b/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet deleted file mode 100644 index 92b4e58c7..000000000 Binary files 
a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 
100644 index 000000000..842ce2caa Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 000000000..e67164596 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet differ diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 deleted file mode 100644 index c92d73bfb..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 deleted file mode 100644 index c3966bec2..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 deleted file mode 100644 index e419c9516..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet new file mode 100644 
index 000000000..2584725bb Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/input/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet b/transforms/universal/fdedup/ray/test-data/input/sample1.parquet deleted file mode 100644 index 58387d07d..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a3771fbd8 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import ( + jaccard_similarity_threshold_cli_param, + num_bands_cli_param, + num_segments_cli_param, + sort_output_cli_param, +) +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + jaccard_similarity_threshold_cli_param: 0.7, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py new file mode 100644 index 000000000..a62105b2c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "get_list_transform", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + "run_locally": True, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_fdedup.py b/transforms/universal/fdedup/ray/test/test_fdedup.py deleted file mode 100644 index fa46fb071..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup.py +++ /dev/null @@ -1,18 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -# There is no local test for fdedup -# This is just a place holder t satisfy overall framework - - -def test_fdedup(): - pass diff --git a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py b/transforms/universal/fdedup/ray/test/test_fdedup_ray.py deleted file mode 100644 index 78ee7cc04..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py +++ /dev/null @@ -1,60 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import os - -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -class TestRayFdedupTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. - """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) - config = { - "run_locally": True, - # When running in ray, our Runtime's get_transform_config() method will load the domains using - # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 1, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, - } - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")] - return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..55869598c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py new file mode 100644 index 000000000..34f3ee403 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform import ( + num_bands_cli_param, + num_permutations_cli_param, + num_segments_cli_param, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_permutations_cli_param: 112, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + } + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile new file mode 100644 index 000000000..b04994d46 --- /dev/null +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -0,0 +1,51 @@ +ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 +FROM ${BASE_IMAGE} + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +WORKDIR ${SPARK_HOME}/work-dir + +# Copy in the data processing framework source/project and install it +# This is expected to be placed in the docker context before this is run (see the make image). 
+COPY --chown=spark:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
+
+## Copy the python version of the transform
+COPY --chown=spark:root python-transform/ python-transform/
+RUN cd python-transform && pip install --no-cache-dir -e .
+
+# Install spark project source
+COPY --chown=spark:root src/ src/
+COPY --chown=spark:root pyproject.toml pyproject.toml
+COPY --chown=spark:root README.md README.md
+RUN mkdir -p /opt/spark/work-dir/src/templates && \
+    mkdir -p /opt/spark/work-dir/config
+COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
+COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/
+
+# install requirements from requirements.txt
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+
+RUN pip install --no-cache-dir -e .
+
+# copy the main() entry point to the image
+COPY ./src/fdedup_transform_spark.py .
+
+# copy test
+COPY test/ test/
+COPY test-data/ test-data/
+
+USER spark
+
+# Set environment
+ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH}
+ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH}
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile
new file mode 100644
index 000000000..ac2735e7d
--- /dev/null
+++ b/transforms/universal/fdedup/spark/Makefile
@@ -0,0 +1,57 @@
+# Define the root of the local git clone for the common rules to be able
+# to know where they are running from.
+REPOROOT=../../../..
+
+# Set this, before including .make.defaults, to
+# 1 if requirements reference the latest code in the data processing library
+# in this repo (that is not yet published to pypi). This is the default setting.
+# 0 if the transforms DPK dependencies are on wheels published to
+# pypi (e.g. data-prep-toolkit=0.2.1)
+#USE_REPO_LIB_SRC=1
+
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse. However, feel free
+# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms
+
+# Include the common configuration for this transform
+include ../transform.config
+
+venv:: .transforms.spark-venv
+
+test:: .transforms.spark-test
+
+clean:: .transforms.clean
+
+image:: .transforms.spark-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-spark
+
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.spark-test-image
+
+run-cli-sample: .transforms.run-cli-spark-sample
+
+run-local-sample: .transforms.run-local-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md
new file mode 100644
index 000000000..3bf9b3245
--- /dev/null
+++ b/transforms/universal/fdedup/spark/README.md
@@ -0,0 +1,109 @@
+# Spark-GUF
+
+This is an implementation of Spark data processing modules.
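As a quick orientation, here is a minimal sketch of launching the signature-calculation step of fuzzy dedup locally; it is condensed from `src/signature_calc_local_spark.py` in this change, and the input/output folders below are placeholders to adapt to your own layout.

```python
import sys

from data_processing.utils import ParamsUtils
from data_processing_spark.runtime.spark import SparkTransformLauncher
from signature_calc_transform_spark import (
    SignatureCalculationSparkTransformConfiguration,
)

# Placeholder folders; point these at your own data layout.
local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
params = {
    # Data access. Only required parameters are specified.
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "scdata_local_config": ParamsUtils.convert_to_ast(local_conf),
    # execution info
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
    # minhash parameters, mirroring the test defaults used in this change
    "minhash_num_permutations": 112,
    "minhash_num_bands": 14,
    "minhash_num_segments": 2,
}

if __name__ == "__main__":
    # Simulate the command line expected by the launcher.
    sys.argv = ParamsUtils.dict_to_req(d=params)
    launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration())
    # Launch the Spark worker(s) to process the input.
    launcher.launch()
```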
+At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster.
+
+The modules can run locally or remotely in a Kubernetes cluster.
+
+## Running Transforms locally
+
+Start in the `spark-guf` directory. To run the modules locally, follow these steps:
+1. Create a virtual environment using this command:
+   ```
+   make venv
+   ```
+2. Activate the virtual environment:
+   ```
+   source venv/bin/activate
+   ```
+3. Set the `PYTHONPATH` environment variable to include the `src` directory:
+   ```
+   export PYTHONPATH=${PYTHONPATH}:${PWD}/src
+   ```
+4. Invoke one of the transforms:
+   ```
+   python src/transforms/spark_pi/spark_transformer_pi.py
+   ```
+5. To find out which arguments a transform takes, run that transform with a `--help` flag:
+   ```
+   python src/transforms/spark_filter/spark_filter_transform.py --help
+   usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE]
+                                    --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP]
+                                    [--filter_logical_operator {AND,OR}]
+
+   optional arguments:
+     -h, --help            show this help message and exit
+     --input_folder INPUT_FOLDER
+                           path to read the input files (local fs or s3)
+     --output_folder OUTPUT_FOLDER
+                           path to write the output files (local fs or s3)
+     --data_type DATA_TYPE
+                           Type of files to filter (parquet, orc, csv, json, txt)
+     --filter_criteria_list FILTER_CRITERIA_LIST
+                           list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04' AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ]
+     --filter_columns_to_drop FILTER_COLUMNS_TO_DROP
+                           list of columns to drop after filtering, for example: ["column1", "column2"]
+     --filter_logical_operator {AND,OR}
+                           logical operator (AND or OR) that joins filter criteria
+   ```
+
+## Running Transforms in Kubernetes/OpenShift
+
+Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps:
+
+1. Build and push a pyspark base docker image (this example assumes that images are pushed to Docker Hub, but the same approach can be used to push images to icr.io or quay.io):
+   ```
+   docker build -t my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark:3.5.1
+   ```
+2. Build and push a specific transform image (this will use the pyspark image built in the previous step as the base image):
+   ```
+   docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark-filter:3.5.1
+   ```
+3. Configure the `spark` service account (note that you can use any other service account name, but you will then need to replace `spark` with `your-service-account-name` in all the yaml files listed below).
+   This is a one-time process to perform for each namespace where you want to run spark apps:
+   ```
+   # create 'spark' service account
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace
+
+   # create 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the cluster roles
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace
+   ```
+
+4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets.
+
+5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory. The steps below refer to the [yaml file used to build the filter pod](deployment/kubernetes/pods/spark-driver-pod-filter.yaml):
+   1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`).
+   2. Specify the namespace where the pod will be created (`metadata/namespace`). Use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`.
+   3. Specify the command to launch the Spark application (in `spec/containers/args`).
+   4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built in step 2).
+   5. Specify the image used by the executors (the `EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`).
+   6. Specify the service account to be used by the driver (`spec/containers/serviceAccount`) and by the executors (the `SERVICE_ACCOUNT` variable in `spec/containers/env`).
+   7. Configure S3:
+      1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs.
+      2. Specify the input and output access key IDs and secret access keys.
+
+6. Launch the Spark application by creating the driver pod:
+   ```
+   kubectl apply -f deployment/kubernetes/pods/spark-driver-pod-filter.yaml
+   ```
+
+7. Monitor the creation of the executor pods:
+   ```
+   kubectl get pods -w
+   ```
+
+8. Monitor the driver logs:
+   ```
+   kubectl logs spark-driver-pod-filter -f
+   ```
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
new file mode 100644
index 000000000..d9579e0c7
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Pod
+metadata:
+spec:
+  imagePullSecrets:
+  - name: prod-all-icr-io
+  securityContext:
+    fsGroup: 0
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
new file mode 100644
index 000000000..eeddbd694
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
@@ -0,0 +1,14 @@
+spark.app.name: ${APP_NAME}
+spark.driver.memory: ${DRIVER_MEMORY}
+spark.executor.instances: ${NUM_EXECUTORS}
+spark.executor.memory: ${EXECUTOR_MEMORY}
+spark.executor.cores: ${EXECUTOR_CORES}
+spark.sql.shuffle.partitions: ${NUM_TASKS}
+spark.task.cpus: ${TASK_CPUS}
+spark.sql.legacy.parquet.nanosAsLong: true
+spark.executor.decommission.forceKillTimeout: "10h"
+# spark.sql.files.ignoreCorruptFiles: true
+# configuration needed when running in kubernetes
+spark.kubernetes.authenticate.driver.serviceAccountName: ${SERVICE_ACCOUNT}
+spark.kubernetes.container.image: ${EXECUTOR_DOCKER_IMAGE}
+spark.kubernetes.namespace: ${EXECUTOR_NAMESPACE}
diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml
new file mode 100644
index 000000000..f77df2010
--- /dev/null
+++ b/transforms/universal/fdedup/spark/pyproject.toml
@@ -0,0 +1,45 @@
+[project]
+name = "dpk_fdedup_transform_spark"
+version = "0.2.2.dev2"
+requires-python = ">=3.10,<3.13"
+description = "Fuzzy Dedup Spark Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
+]
+dynamic = ["dependencies"]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt
new file mode 100644
index 000000000..c373ffbb7
--- /dev/null
+++ b/transforms/universal/fdedup/spark/requirements.txt
@@ -0,0 +1,11 @@
+dpk_fdedup_transform_python==0.2.2.dev2
+data-prep-toolkit[spark]==0.2.2.dev2
+pyyaml>=6.0.2
+boto3>=1.34.69
+kubernetes>=30.1.0
+polars==1.9.0
+disjoint-set>=0.8.0
+numpy<1.29.0
+sentencepiece>=0.2.0
+mmh3>=4.1.0
+scipy>=1.12.0, <2.0.0
diff --git
a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py new file mode 100644 index 000000000..c9950657c --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py new file mode 100644 index 000000000..feeb3241e --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -0,0 +1,75 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisSparkRuntime(DefaultSparkTransformRuntime): + """ + Cluster analysis runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisSparkRuntime, + ) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis spark transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py new file mode 100644 index 000000000..eb1e61845 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py new file mode 100644 index 000000000..2ff0df8bf --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -0,0 +1,124 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class DataCleaningSparkRuntime(DefaultSparkTransformRuntime): + """ + Data cleaning runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Data Cleaning + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningSparkRuntime, + ) + + def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. 
This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :return: dictionary of parameters to be broadcast + """ + data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + ) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return {"df": self.duplicate_list} + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py new file mode 100644 index 000000000..82767f849 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py @@ -0,0 +1,62 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import argparse +import os +import sys + +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing_spark.runtime.spark import SparkTransformLauncher +from fdedup_transform_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + + +class SparkServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + elif service_short_name == "cluster": + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + status = launcher.launch() + return status + + +if __name__ == "__main__": + + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = SparkServiceOrchestrator(global_params=args) + # Launch spark fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/src/requirements.txt new file mode 100644 index 000000000..c1a1f2c3d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/requirements.txt @@ -0,0 +1,8 @@ +pyspark +pyarrow +pyyaml +boto3 +kubernetes +disjoint_set +mmh3 +scipy diff --git a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py new file mode 100644 index 000000000..2db884346 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py new file mode 100644 index 000000000..4e39810c6 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Signature Calculation + as required by the PythonTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file 
mode 100644 index 000000000..8e1fe121e Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + 
"target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet new file mode 100644 index 000000000..03a0c321a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": 
"job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..8aa870c00 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 
64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/metadata.json new file mode 100644 index 000000000..a0b26f931 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 4.5, + "gpus": 0, + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet differ diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 000000000..842ce2caa Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 000000000..e67164596 Binary 
files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + 
"max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/input/df1.parquet b/transforms/universal/fdedup/spark/test-data/input/df1.parquet new file mode 100644 index 000000000..2584725bb Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/input/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py new file mode 100644 index 000000000..294c86f25 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = SparkTransformLauncher(ClusterAnalysisSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py new file mode 100644 index 000000000..919857e23 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py @@ -0,0 +1,58 @@ +# (C) Copyright IBM Corp. 
2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = SparkTransformLauncher(DataCleaningSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py new file mode 100644 index 000000000..6d93dc7a9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config index 774716e15..ffaeb9f45 100644 --- a/transforms/universal/fdedup/transform.config +++ b/transforms/universal/fdedup/transform.config @@ -14,5 +14,6 @@ TRANSFORM_NAME=fdedup # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -FDEDUP_RAY_VERSION=$(DPK_VERSION) - +FDEDUP_PYTHON_VERSION=$(DPK_VERSION) +FDEDUP_RAY_VERSION=$(FDEDUP_PYTHON_VERSION) +FDEDUP_SPARK_VERSION=$(FDEDUP_PYTHON_VERSION) diff --git a/transforms/universal/fdedup/utils/Makefile.local b/transforms/universal/fdedup/utils/Makefile.local new file mode 100644 index 000000000..d9dae01d7 --- /dev/null +++ b/transforms/universal/fdedup/utils/Makefile.local @@ -0,0 +1,18 @@ +PYTHON=python +PIP=pip + +venv: requirements.txt + $(PYTHON) -m venv venv + if [ -e venv/Scripts/activate ]; then \ + echo "For Windows please try the following AS Administrator - no guarantees"; \ + echo " venv\\Scripts\\activate"; \ + echo " pip install --upgrade pip"; \ + echo " pip install -r requirements.txt"; \ + echo " pip install pytest"; \ + else \ + . venv/bin/activate; \ + $(PIP) install --upgrade pip; \ + $(PIP) install -r requirements.txt; \ + fi +set-versions: + @: \ No newline at end of file diff --git a/transforms/universal/fdedup/utils/calc_r_and_b.ipynb b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb new file mode 100644 index 000000000..8398f9efa --- /dev/null +++ b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf5dba9a-d530-4a0a-ae71-2d741f7e705f", + "metadata": {}, + "source": [ + "This notebook allows calculating the values for `b` (the number of bands) and `r` (the number of minhashes in a band) used in the fuzzy dedup algorithm. The default values are `b=14` and `r=8`, as defined in the [FineWeb datasets paper](https://arxiv.org/pdf/2406.17557). The x-axis of the graph represents the Jaccard similarity between a pair of documents, while the y-axis represents the probability that they become duplication candidates. Please refer to http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf for more details on this methodology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800bc113-8b5e-4cec-8717-98fa05753bd0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define the parameterized function\n", + "def f(s, r, b):\n", + " return 1 - (1 - s**r)**b\n", + "\n", + "# Set the parameters r and b\n", + "r = 8\n", + "b = 14\n", + "\n", + "# Generate values for s in a range, e.g., from 0 to 1\n", + "s_values = np.linspace(0, 1, 500) # 500 points between 0 and 1\n", + "f_values = f(s_values, r, b)\n", + "\n", + "# Plot the function\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(s_values, f_values, label=fr\"$f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\", color='blue')\n", + "plt.xlabel(\"s\")\n", + "plt.ylabel(\"f(s)\")\n", + "plt.title(f\"Plot of the function $f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98016b04-b6a0-465d-b65b-6d402978c9f0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/utils/requirements.txt b/transforms/universal/fdedup/utils/requirements.txt new file mode 100644 index 000000000..ce2acfefb --- /dev/null +++ b/transforms/universal/fdedup/utils/requirements.txt @@ -0,0 +1,3 @@ +jupyter +numpy +matplotlib
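For readers of this diff, the short sketch below restates the banding math that the calc_r_and_b.ipynb notebook plots. It is not part of the PR: the function name, the sample similarity values, and the printed labels are illustrative assumptions. The probability formula and the (1/b)^(1/r) threshold rule of thumb come from the MMDS chapter the notebook links to, and the band/segment count mirrors the num_bands=14, num_segments=2 defaults used in the test fixtures above.

# Minimal sketch, assuming b bands of r minhashes each, as in the notebook (not repository code).
def candidate_probability(s: float, r: int = 8, b: int = 14) -> float:
    """Probability that two documents with Jaccard similarity s collide in at least one band."""
    return 1.0 - (1.0 - s**r) ** b

if __name__ == "__main__":
    r, b = 8, 14
    # MMDS rule of thumb: the S-curve rises steeply near (1/b)**(1/r), about 0.72 here,
    # which is consistent with the 0.7 jaccard_similarity_threshold used by cluster analysis.
    print("approximate candidate threshold:", round((1.0 / b) ** (1.0 / r), 3))
    for s in (0.5, 0.7, 0.8, 0.9):
        print(f"P(candidate | s={s}) = {candidate_probability(s, r, b):.3f}")
    # 14 bands x 2 segments = 28 band/segment partitions, matching the 28 input files
    # reported in the expected cluster_analysis metadata earlier in this diff.
    print("band/segment partitions:", b * 2)

As a design note, raising b (more bands) or lowering r pushes the effective threshold down and catches more near-duplicate pairs at the cost of more false candidates; the notebook's plot makes that trade-off visible before committing to new defaults.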