diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile index d4769187b..5fde2bb07 100644 --- a/data-processing-lib/spark/Makefile +++ b/data-processing-lib/spark/Makefile @@ -11,9 +11,14 @@ setup:: set-versions: .check-env $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml - sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/' \ - pyproject.toml > tt.toml - mv tt.toml pyproject.toml + if [ -e pyproject.toml ]; then \ + cat pyproject.toml | sed -e 's/"spark[default]==.*",/"spark[default]==$(SPARK_VERSION)",/' > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed -e 's/ray[default]==.*/ray[default]==$(SPARK_VERSION)/' > tt.txt; \ + mv tt.txt requirements.txt; \ + fi build:: build-dist @@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist publish-image:: .defaults.publish-image -venv:: pyproject.toml +venv:: $(MAKE) .defaults.spark-lib-src-venv pip install pytest pytest-cov diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index cb7b1ee10..7054a7b9a 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -17,7 +17,7 @@ if [ ! -d transforms ]; then echo Please run this script from the top of the repository exit 1 fi -KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser" +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser fdedup" while [ $# -ne 0 ]; do case $1 in -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..683f93210 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -14,14 +14,24 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from src.fdedup_compute_execution_params import ( + cluster_analysis_compute_execution_params, + compute_common_params, + data_cleaning_compute_execution_params, + get_duplicate_list_compute_execution_params, + signature_calc_compute_execution_params, +) from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" +task_image = os.getenv("FDEDUP_IMAGE_LOCATION", "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest") +image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -40,8 +50,18 @@ # compilation time. 
import uuid - compute_exec_params_op = dsl.component_decorator.component( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = dsl.component_decorator.component(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = dsl.component_decorator.component( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = dsl.component_decorator.component( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = dsl.component_decorator.component( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = dsl.component_decorator.component( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) print( "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " @@ -49,61 +69,95 @@ ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = comp.create_component_from_func( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = comp.create_component_from_func( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = comp.create_component_from_func( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = comp.create_component_from_func( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute signature calculation job +execute_signature_calc_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) +# execute cluster analysis job +execute_cluster_analysis_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute get duplicate list job +execute_get_duplicate_list_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute data cleaning job +execute_data_cleaning_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
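The task image and image pull secret above are now resolved from the environment, so a deployment-specific Ray image can be selected without editing the workflow source. A minimal usage sketch follows; the image tag, secret name, and KFPv2 setting are illustrative assumptions, not values from this patch.

import os
import runpy

# Hypothetical overrides; set before the module is loaded so that
# FDEDUP_IMAGE_LOCATION / FDEDUP_IMAGE_PULL_SECRET are picked up at import time.
os.environ["FDEDUP_IMAGE_LOCATION"] = "quay.io/myorg/fdedup-ray:0.2.2"
os.environ["FDEDUP_IMAGE_PULL_SECRET"] = "my-registry-secret"
os.environ["KFPv2"] = "1"  # optional: select the KFP v2 component factories shown above

# Run the workflow module as a script; this compiles the pipeline into fdedup_wf.yaml.
runpy.run_path("fdedup_wf.py", run_name="__main__")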
-TASK_NAME: str = "fdedup" +TASK_NAME: str = "fuzzydedup" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", + description="Pipeline for fuzzy dedup", ) -def fdedup( +def fuzzydedup( + # folders used # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", + data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", + data_s3_access_secret: str = "s3-south-secret", + scdata_s3_access_secret: str = "s3-south-secret", + dcdata_s3_access_secret: str = "s3-south-secret", data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, + fdedup_contents_column: str = "contents", + fdedup_document_id_column: str = "int_id_column", # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, + fdedup_num_permutations: int = 112, + fdedup_num_bands: int = 14, + fdedup_num_minhashes_per_band: int = 8, + fdedup_word_shingle_size: int = 5, + fdedup_shingle_option: str = "word", + fdedup_jaccard_similarity_threshold: float = 0.75, + fdedup_seed: int = 42, + fdedup_docs_to_remove_folder: str = "docs_to_remove", + fdedup_duplicate_list_location: str = os.path.join( + "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ), + fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, # additional parameters @@ -136,63 +190,47 @@ def fdedup( wait_print_tmout - time between prints, sec http_retries - http retries for API server calls :param data_s3_access_secret - s3 access secret + :param scdata_s3_access_secret - signature calculation s3 access secret + :param 
dcdata_s3_access_secret - data cleaning s3 access secret :param data_s3_config - s3 configuration :param data_max_files - max files to process :param data_num_samples - num samples to process - :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_contents_column - document column name + :param fdedup_document_id_column - integer document id column name :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. - A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_num_bands - number of bands + :param fdedup_num_minhashes_per_band - length of a band + :param fdedup_word_shingle_size - length of word shingles + :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' + :param fdedup_jaccard_similarity_threshold - similarity threshold + :param fdedup_seed - seed for the random number generator + :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids + :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates + :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params - compute_exec_params = compute_exec_params_op( + compute_common_exec_params = compute_common_params_op( worker_options=ray_worker_options, - actor_options=runtime_actor_options, data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=run_id, - runtime_code_location=runtime_code_location, - doc_column=fdedup_doc_column, - id_column=fdedup_id_column, - cluster_column=fdedup_cluster_column, - bucket_cpu=fdedup_bucket_cpu, - doc_cpu=fdedup_doc_cpu, - mhash_cpu=fdedup_mhash_cpu, num_permutations=fdedup_num_permutations, - threshold=fdedup_threshold, - shingles_size=fdedup_shingles_size, - delimiters=fdedup_delimiters, - random_delay_limit=fdedup_random_delay_limit, - snapshot_delay=fdedup_snapshot_delay, - use_doc_snapshot=fdedup_use_doc_snapshot, - use_bucket_snapshot=fdedup_use_bucket_snapshot, 
n_samples=fdedup_n_samples, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) + fdedup_num_segments = compute_common_exec_params.outputs["num_segments"] + runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] + runtime_num_actors = compute_common_exec_params.outputs["num_actors"] # start Ray cluster ray_cluster = create_ray_op( @@ -204,21 +242,147 @@ def fdedup( additional_params=additional_params, ) ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ray_cluster.after(compute_common_exec_params) + + # Get the parameters for the signature calculation job + compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_column=fdedup_contents_column, + id_column=fdedup_document_id_column, + num_permutations=fdedup_num_permutations, + num_bands=fdedup_num_bands, + num_minhashes_per_band=fdedup_num_minhashes_per_band, + word_shingle_size=fdedup_word_shingle_size, + shingle_option=fdedup_shingle_option, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + seed=fdedup_seed, + ) + ComponentUtils.add_settings_to_component(compute_signature_calc_exec_params, ONE_HOUR_SEC * 2) + compute_signature_calc_exec_params.after(ray_cluster) + + # Execute signature calculation job + execute_signature_calc_job = execute_signature_calc_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_signature_calc_exec_params.output, + exec_script_name=SIGNATURE_CALC_EXEC_SCRIPT_NAME, + server_url=server_url, + prefix="scdata", + ) + ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) + execute_signature_calc_job.after(compute_signature_calc_exec_params) + + # Get the parameters for the cluster analysis job + compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + num_bands=fdedup_num_bands, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + ) + ComponentUtils.add_settings_to_component(compute_cluster_analysis_exec_params, ONE_HOUR_SEC * 2) + compute_cluster_analysis_exec_params.after(execute_signature_calc_job) + # Execute job + execute_cluster_analysis_job = execute_cluster_analysis_job_op( + ray_name=ray_name, + run_id=run_id, + 
additional_params=additional_params, + exec_params=compute_cluster_analysis_exec_params.output, + exec_script_name=CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) + + compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + duplicate_docids_folder=fdedup_docs_to_remove_folder, + duplicate_list_location=fdedup_duplicate_list_location, + ) + ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) + compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) + # Execute job + execute_get_duplicate_list_job = execute_get_duplicate_list_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_get_duplicate_list_exec_params.output, + exec_script_name=GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) + + compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + id_column=fdedup_document_id_column, + duplicate_list_location=fdedup_duplicate_list_location, + operation_mode=fdedup_operation_mode, + ) + ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) + compute_data_cleaning_exec_params.after(execute_get_duplicate_list_job) + # Execute job - execute_job = execute_ray_jobs_op( + execute_data_cleaning_job = execute_data_cleaning_job_op( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, + exec_params=compute_data_cleaning_exec_params.output, + exec_script_name=DATA_CLEANING_EXEC_SCRIPT_NAME, server_url=server_url, + prefix="dcdata", ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, 
dcdata_s3_access_secret, prefix="dcdata" + ) + execute_data_cleaning_job.after(compute_data_cleaning_exec_params) if __name__ == "__main__": # Compiling the pipeline - compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml")) + compiler.Compiler().compile(fuzzydedup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 726200339..cd3a58b99 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,10 +10,79 @@ # limitations under the License. ################################################################################ +from typing import Any, Dict, NamedTuple -def fdedup_compute_execution_params( + +def compute_common_params( worker_options: dict, # ray worker configuration - actor_options: dict, # actor's resource requirements + data_s3_config: str, # S3 configuration + num_permutations: int, # number of permutations (minhashes) per document + n_samples: int, # files to sample for number of documents estimation +) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]): + + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB + from runtime_utils import KFPUtils + + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling: dict[str, Any] + sampling, _ = data_access.sample_input_data(n_samples=n_samples) + number_of_docs = int(sampling.get("estimated number of docs")) + if number_of_docs == 0: + print(f"Estimated number of documents and documents size is zero. Please verify the input path.") + sys.exit(1) + print(f"Estimated number of docs: {number_of_docs}") + # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: + # 8 bytes are taken by the band hash + # (num_permutations * 4) bytes are taken by the min hashes + # 20 bytes to provide some extra space for storage in a table + # The total amount of space needed by a band is number_of_docs * doc_bytes. + # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB + doc_bytes = 8 + num_permutations * 4 + 20 + band_bytes = number_of_docs * doc_bytes + num_segments = 1 + (band_bytes // (3 * GB)) + print(f"Number of segments: {num_segments}") + + # To process data efficiently, each actor needs 16GB of memory. + # The actor config controls CPU allocation, not memory; + # use CPU allocation s.t. the number of actors on a worker provides access to 16GB of memory for each actor. 
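A worked example of the sizing logic in compute_common_params, using the pipeline defaults plus hypothetical corpus and worker numbers (the document count and worker_options values below are assumptions for illustration only):

GB = 1024 * 1024 * 1024                                # assuming data_processing.utils defines GB as 2**30
num_permutations, number_of_docs = 112, 10_000_000     # default permutations + hypothetical corpus size
doc_bytes = 8 + num_permutations * 4 + 20              # 476 bytes per document
band_bytes = number_of_docs * doc_bytes                # ~4.4 GiB per band
num_segments = 1 + band_bytes // (3 * GB)              # 2 segments per band

# Actor sizing with hypothetical worker_options of 3 replicas, 16 CPUs, 64 GB each:
replicas, cpu, memory = 3, 16, 64
num_actors_per_node = memory // 16                     # 16 GB per actor -> 4 actors per node
cpus_per_actor = (cpu - 1) / num_actors_per_node       # 15 / 4 = 3.75 CPUs per actor
# the function below then multiplies by (replicas - 1) worker nodes and caps the total at 2000 actors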
+ # Also, to keep S3 utilization in check, limit the number of actors to 2000 + num_nodes = worker_options["replicas"] + cpu_per_node = worker_options["cpu"] - 1 + memory_per_node = worker_options["memory"] + + memory_per_actor = 16 # GB + max_num_actors = 2000 + num_actors_per_node: int = int(memory_per_node / memory_per_actor) + if num_actors_per_node == 0: + num_actors_per_node = 1 + # never run actors on the head node, so (n - 1) nodes to run actors + num_actors = (num_nodes - 1) * num_actors_per_node + + while num_actors > max_num_actors: + num_actors -= num_nodes - 1 + num_actors_per_node -= 1 + print(f"Number of actors per node = {num_actors_per_node}") + cpus_per_actor = cpu_per_node / num_actors_per_node + print(f"CPUs per actor = {cpus_per_actor}") + + from collections import namedtuple + + fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) + return fdedup_params(num_segments, num_actors, cpus_per_actor) + + +def signature_calc_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -22,27 +91,20 @@ def fdedup_compute_execution_params( runtime_code_location: dict, # code location doc_column: str, # document column name id_column: str, # integer document id column name - cluster_column: str, # cluster column name - bucket_cpu: float, # number of CPUs per bucket hash - doc_cpu: float, # number of CPUs per doc hash - mhash_cpu: float, # number of CPUs per minhash hash num_permutations: int, # number of permutations + num_bands: int, # number of bands + num_minhashes_per_band: int, # band length + word_shingle_size: int, # number of words in shingle + shingle_option: str, # type of shingle, one of 'word' or 'char' threshold: float, # threshold, - shingles_size: int, # number of words in shingle - delimiters: str, # delimiter for splitting document - random_delay_limit: int, # delay between reads to reduce S3 load. 
- # A random number between 0 and random_delay_limit is used - snapshot_delay: int, # delay between restoring individual actors - use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots - use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots - n_samples: int, # number of samples to use -) -> dict: # NamedTuple( - # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + num_segments: int, # number of segments + seed: int, # seed for the random number generator +) -> dict: """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements + Compute fuzzy dedup execution parameters for signature calculation + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -51,182 +113,202 @@ def fdedup_compute_execution_params( :param runtime_code_location: code location :param doc_column: document column name :param id_column: integer document id column name - :param cluster_column: cluster column name - :param bucket_cpu: number of CPUs per bucket hash - :param doc_cpu: number of CPUs per doc hash - :param mhash_cpu: number of CPUs per minhash hash :param num_permutations: number of permutations + :param num_bands: number of bands + :param num_minhashes_per_band: band length + :param word_shingle_size: number of words in shingle + :param shingle_option: str: type of shingle, one of 'word' or 'char' :param threshold: threshold, - :param shingles_size: number of words in shingle - :param delimiters: delimiter for splitting document - :param random_delay_limit: # delay between reads to reduce S3 load. 
A random number between 0 and random_delay_limit is used - :param snapshot_delay: delay between restoring individual actors - :param use_doc_snapshot: flag to skip documents building and start from existing snapshots - :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots - :param n_samples: number of samples to use + :param num_segments: number of segments + :param seed: seed for the random number generator :return: a dictionary with a Ray Job execution parameters """ - import math - import sys - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt + # fuzzy parameters for signature calculation + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + print(f"runtime_actor_options = {runtime_actor_options}") + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "minhash_contents_column": doc_column, + "minhash_document_id_column": id_column, + "minhash_num_permutations": num_permutations, + "minhash_num_bands": num_bands, + "minhash_num_minhashes_per_band": num_minhashes_per_band, + "minhash_word_shingle_size": word_shingle_size, + "minhash_shingle_option": shingle_option, + "minhash_jaccard_similarity_threshold": threshold, + "minhash_num_segments": num_segments, + "minhash_seed": seed, + "scdata_s3_config": data_s3_config, + } + + +def cluster_analysis_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + 
data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + num_bands: int, # number of bands + threshold: float, # threshold, + num_segments: int, # number of segments +) -> dict: + + """ + Compute fuzzy dedup execution parameters for cluster analysis + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param num_bands: number of bands + :param threshold: threshold, + :param num_segments: number of segments + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=threshold, - num_perm=num_permutations, - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") # Get cluster parameters - cluster_cpu = worker_options["replicas"] * worker_options["cpu"] - cluster_memory = worker_options["replicas"] * worker_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu -= 1 - cluster_memory *= 0.85 - # get actor requirements - actor_cpu = actor_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling, _ = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. Please verify the input path.") - sys.exit(1) - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors <= 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) - - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") + data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "cluster_num_bands": num_bands, + "cluster_jaccard_similarity_threshold": threshold, + "cluster_num_segments": num_segments, + } - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") +def get_duplicate_list_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + duplicate_docids_folder: str, # folder with the docs IDs to remove + duplicate_list_location: str, # location of the list of duplicate doc ids +) -> dict: + """ + Compute fuzzy dedup execution parameters for get duplicate list step + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + 
:param duplicate_docids_folder: folder with the docs IDs to remove + :param duplicate_list_location: location of the list of duplicate doc ids + :return: a dictionary with a Ray Job execution parameters + """ + import json + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = base_folder + data_s3_config_dict["output_folder"] = base_folder + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "fdlist_docs_to_remove": duplicate_docids_folder, + "fdlist_consolidated_filename": duplicate_list_location, + } + + +def data_cleaning_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + id_column: str, # integer document id column name + duplicate_list_location: str, # location of the list of duplicate doc ids + operation_mode: str, # filter (non-)duplicates or annotate +) -> dict: + """ + Compute fuzzy dedup execution parameters + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param id_column: integer document id column name + :param duplicate_list_location: location of the list of duplicate doc ids + :param operation_mode: filter (non-)duplicates or annotate + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, - "runtime_num_workers": n_workers, - "runtime_worker_options": str(actor_options), + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - "fdedup_doc_column": doc_column, - 
"fdedup_id_column": id_column, - "fdedup_cluster_column": cluster_column, - "fdedup_bucket_cpu": bucket_cpu, - "fdedup_doc_cpu": doc_cpu, - "fdedup_mhash_cpu": mhash_cpu, - "fdedup_num_doc_actors": d_actors, - "fdedup_num_bucket_actors": b_actors, - "fdedup_num_minhash_actors": m_actors, - "fdedup_num_preprocessors": n_preprocessors, - "fdedup_num_permutations": num_permutations, - "fdedup_threshold": threshold, - "fdedup_shingles_size": shingles_size, - "fdedup_delimiters": delimiters, - "fdedup_random_delay_limit": random_delay_limit, - "fdedup_snapshot_delay": snapshot_delay, - "fdedup_use_doc_snapshot": use_doc_snapshot, - "fdedup_use_bucket_snapshot": use_bucket_snapshot, + "fdclean_document_id_column": id_column, + "fdclean_duplicate_list_location": duplicate_list_location, + "fdclean_operation_mode": operation_mode, } diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ b/transforms/universal/fdedup/python/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile new file mode 100644 index 000000000..071478870 --- /dev/null +++ b/transforms/universal/fdedup/python/Dockerfile @@ -0,0 +1,44 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy source data +COPY src/ src/ + +# copy source data +COPY ./src/fdedup_transform_python.py fdedup_transform_python.py +COPY ./src/fdedup_transform_python.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile new file mode 100644 index 000000000..05f6bf5ca --- /dev/null +++ b/transforms/universal/fdedup/python/Makefile @@ -0,0 +1,64 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
+include $(REPOROOT)/transforms/.make.transforms + +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.python-venv + +test:: .transforms.python-test + +clean:: .transforms.clean + +image:: .transforms.python-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-python + +setup:: .transforms.setup + +# distribution versions is the same as image version. +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.python-test-image + +run-cli-sample: .transforms.run-cli-python-sample + +run-local-sample: .transforms.run-local-sample + +run-local-python-sample: .transforms.run-local-python-sample + +#run-s3-ray-sample: .transforms.run-s3-ray-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md new file mode 100644 index 000000000..34f18c73b --- /dev/null +++ b/transforms/universal/fdedup/python/README.md @@ -0,0 +1,11 @@ +# Fuzzy Dedup + +Please see the set of +[transform project conventions](../../../README.md) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary + +The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see +[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. 
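The transform computes per-document MinHash signatures over shingles and groups them into bands for locality-sensitive hashing; the downstream cluster-analysis, duplicate-list and data-cleaning steps then turn band collisions into a cleaned (or annotated) dataset. A minimal sketch of the underlying MinHash/banding idea, using the Murmur_MH helper added under src/ in this PR; the sample text, the shingling helper, and the 14x8 band layout mirror the defaults used elsewhere in this patch and are illustrative only, not the transform's implementation:

from Murmur_MH import Murmur_MH   # helper class added under src/ in this PR

def word_shingles(text: str, k: int = 5) -> list[str]:
    # illustrative shingling; the transform has its own word/char shingle_option handling
    words = text.split()
    return [" ".join(words[i : i + k]) for i in range(max(1, len(words) - k + 1))]

mm = Murmur_MH(num_perm=112, seed=42)
sig_a = mm.minhash(word_shingles("the quick brown fox jumps over the lazy dog today"))
sig_b = mm.minhash(word_shingles("the quick brown fox jumped over the lazy dog today"))
print(Murmur_MH.jaccard(sig_a, sig_b))   # estimated Jaccard similarity of the two texts

# With b bands of r minhashes each (defaults: b=14, r=8, i.e. 112 permutations),
# two documents of true similarity s collide in at least one band with
# probability 1 - (1 - s**r)**b (see the MMDS chapter linked above).
b, r = 14, 8
for s in (0.5, 0.75, 0.9):
    print(s, 1 - (1 - s**r) ** b)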
\ No newline at end of file diff --git a/data-processing-lib/spark/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml similarity index 54% rename from data-processing-lib/spark/pyproject.toml rename to transforms/universal/fdedup/python/pyproject.toml index 89b4d9bf8..97be33d54 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,31 +1,21 @@ [project] -name = "data_prep_toolkit_spark" +name = "dpk_fdedup_transform_python" version = "0.2.2.dev2" -keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" -description = "Data Preparation Toolkit Library for Spark" +description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyspark>=3.5.2", - "psutil>=6.0.0", - "PyYAML>=6.0.2" -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ @@ -44,7 +34,7 @@ dev = [ package_dir = ["src","test"] [options.packages.find] -where = ["src/data_processing_spark"] +where = ["src/"] [tool.pytest.ini_options] # Currently we use low coverage since we have to run tests separately (see makefile) diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..4e69a72e4 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +data-prep-toolkit==0.2.2.dev2 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +scipy>=1.14.1, <2.0.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py new file mode 100644 index 000000000..03d5047ea --- /dev/null +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -0,0 +1,112 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + + +import logging +import os +from typing import List, Set + +import mmh3 +import numpy as np + + +class Murmur_MH: + def __init__(self, num_perm=64, seed=42, hashfunc=None): + self.seed = seed + self.num_perm = num_perm # the number of buckets, i.e. the vector length after self.minhash() call + self.permutations = self._init_permutations(seed, num_perm) + + def _init_permutations(self, seed, num_perm): + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + max_int = np.uint64((1 << 64) - 1) + # initialize pseudo random number generator with given seed value + gen = np.random.RandomState(seed) + # get self.num_perm pseudo random numbers between 2 and max_int (excl) + permutations = np.array( + [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], + dtype=np.uint64, + ).T + # make all even pseudo random numbers odd by adding 1 + permutations[permutations % 2 == 0] += 1 + return permutations + + def minhash(self, shingles: List[str]): + """return np.array of minhash""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + ) + + def minhash2(self, shingles: List[str], doc_len: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0), + doc_len, + ) + + def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + .tolist(), + doc_len, + doc_id, + ) + + @staticmethod + def jaccard(mh1: np.array, mh2: np.array) -> float: + """ + The Jaccard similarity measures the similarity between two sets of data + to see which members are shared and distinct. + + The Jaccard similarity is calculated by dividing the number of observations + in both sets by the number of observations in either set. + + Developed by Paul Jaccard, the index ranges from 0 to 1. + The closer to 1, the more similar the two sets of data. + + As a document is represented by a set. We use Jaccard distance to see how similar between two documents. 
+ """ + assert len(mh1) == len(mh2) + return np.count_nonzero(mh1 == mh2) / len(mh1) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py new file mode 100644 index 000000000..bb785021c --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py new file mode 100644 index 000000000..a9822babe --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -0,0 +1,336 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import io +import os +import re +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from Murmur_MH import Murmur_MH + + +short_name = "cluster" +cli_prefix = f"{short_name}_" + +# configuration keys +num_bands_key = "num_bands" +""" This key holds the number of bands used in the banding technique""" +num_segments_key = "num_segments" +""" This key holds the number of segments dividing the hashing space for each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" + +# command line arguments +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands used in the banding technique""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments dividing the hashing space for each band""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" + +captured_arg_keys = [ + num_bands_key, + num_segments_key, + jaccard_similarity_threshold_key, + sort_output_key, +] + +# defaults +num_bands_default = 14 +""" Default number of bands used in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments dividing the hashing space for each band""" +sort_output_default = False + + +class ClusterAnalysisTransform(AbstractFolderTransform): + """ + This is the second transform of the fuzzy dedup pipeline. It runs in parallel: + for each band, the hashing interval is divided into segments. A cluster analysis + uses as input all the parquet files from segment of a band. The `bands` output + of the signature calculation, the first transform in the fuzzy dedup pipeline + contains all the data for a given segment s of a specific band b in the + subfolder `bands/band=b/segment=s`. + The transform loads all the parquet files in the `bands/band=b/segment=s` + subfolder. Each one of these parquet files has two columns: the `band_hash` + and a `data` structure, which includes the `document_id`, the `minhashes` and + the `document_size` fields. Once all the files have been loaded in a single + dataframe, a `group_by` operation on the `band_hash` field is performed in + that dataframe. All the documents that have the same band_hash are grouped + in a cluster. Subsequently, the documents of each cluster are sorted in + descending order according to their size, and a Jaccard similarity is + calculated between the cluster documents. The documents for which the Jaccard + similarity is above the `jaccard_similarity_threshold` remain in the cluster, + the others are removed from the cluster. 
Finally, from each cluster that has + more than one document after running the Jaccard similarity, we select a doc + to keep (the largest size document), and mark the other documents as + duplicates. The resulting clusters are saved in a file for further analysis. + + Args: + num_bands: number of bands used in the banding technique + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + num_segments: the number of segments dividing the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_segments = config.get(num_segments_key, num_segments_default) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.sort_output = config.get(sort_output_key, sort_output_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Cluster analysis for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") + + # consolidate into a single data frame band hashes computed by workers + band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + metadata |= consolidation_stats + # cluster grouping by band hashes + cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + metadata |= cluster_stats + # cluster analysis using jaccard similarity + jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + metadata |= jaccard_stats + # Generate the docs_to_remove dataframe + docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") + output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) + self.logger.debug(f"{len(docs_to_remove_dataframe)} documents marked to remove") + metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + band_segment_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} 
has {len(df)} rows") + band_segment_dataframe = band_segment_dataframe.vstack(df) + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": band_segment_dataframe.to_arrow().nbytes, + "consolidated_rows": len(band_segment_dataframe), + } + return band_segment_dataframe, consolidation_stats + + def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") + cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( + pl.col("cluster_length") > 1 + ) + # self.logger.info(f"file_name = {file_name}") + num_clusters = len(cluster_dataframe) + if num_clusters > 0: + sum_cdocs = cluster_dataframe.select(pl.sum("cluster_length")).item() + max_cdocs = cluster_dataframe.select(pl.max("cluster_length")).item() + min_cdocs = cluster_dataframe.select(pl.min("cluster_length")).item() + avg_cdocs = cluster_dataframe.select(pl.mean("cluster_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.debug(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + cluster_stats = { + "groupby_clusters": num_clusters, + "cluster_duplicate_docs": sum_cdocs, + } + return cluster_dataframe, cluster_stats + + def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + # Define the schema with specific data types + schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + doc_ids_lists = [] + docs_to_remove_lists = [] + len_of_docs2remove_lists = [] + for row in df.iter_rows(named=True): + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_lists += doc_ids_list + docs_to_remove_lists += docs_to_remove_list + len_of_docs2remove_lists += len_of_docs2remove_list + jaccard_cluster_dataframe = pl.DataFrame( + { + "first_doc": doc_ids_lists, + "docs_to_remove": docs_to_remove_lists, + "docs_to_remove_length": len_of_docs2remove_lists, + }, + schema=schema, + ) + filtered_jaccard_dataframe = jaccard_cluster_dataframe.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_jaccard_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_jaccard_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_jaccard_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_jaccard_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_jaccard_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.debug(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + jaccard_stats = { + "jaccard_clusters": num_clusters, + "jaccard_duplicate_docs": sum_cdocs, + } + if self.sort_output: + filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") + return filtered_jaccard_dataframe, jaccard_stats + + def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + # Process row and return a new list of Series or a new row + threshold 
= self.jaccard_similarity_threshold + doc_ids_list = [] + docs_to_remove_list = [] + len_of_docs2remove_list = [] + # sort documents + document_data = row["document_data"] + + # Sort the list by 'document_length' + sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"])) + + # Extracting int_id_column values into a list + doc_list = [item["int_id_column"] for item in sorted_document_data] + + # Creating a dictionary with int_id_column as key and minhashes as value + doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data} + + while len(doc_list) > 1: + docs_to_remove = [] + new_doc_list = [] + # this is the document we are going to keep + first_doc = doc_list[0] + first_mh = doc_minhashes[first_doc] + for int_id_column in doc_list[1:]: + doc_mh = doc_minhashes[int_id_column] + distance = Murmur_MH.jaccard(np.array(first_mh), np.array(doc_mh)) + if distance >= threshold: + docs_to_remove.append(int_id_column) + else: + new_doc_list.append(int_id_column) + if len(docs_to_remove) > 0: + docs_to_remove = list(set(docs_to_remove)) + doc_ids_list.append(first_doc) + docs_to_remove_list.append(docs_to_remove) + len_of_docs2remove_list.append(len(docs_to_remove)) + doc_list = new_doc_list + + return doc_ids_list, docs_to_remove_list, len_of_docs2remove_list + + +class ClusterAnalysisTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=ClusterAnalysisTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) + """ + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="The number of bands used in the banding technique", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="The number of segments dividing the hashing space for each band", + ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py new file mode 100644 index 000000000..c35c5a711 --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -0,0 +1,76 @@ +# (C) Copyright IBM Corp. 2024. 
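
# A standalone sketch of the greedy loop implemented in jaccard_distance_calculation above:
# within a cluster, keep the largest document and mark every other document whose estimated
# Jaccard similarity meets the threshold as a duplicate, then repeat on the remainder.
# Toy data only; the field names mirror the transform but nothing here is imported from it.
import numpy as np

def greedy_duplicates(docs, threshold=0.75):
    # docs: list of dicts with int_id_column, document_length, minhashes
    docs = sorted(docs, key=lambda d: (-d["document_length"], d["int_id_column"]))
    ids = [d["int_id_column"] for d in docs]
    mh = {d["int_id_column"]: np.asarray(d["minhashes"]) for d in docs}
    removed = []
    while len(ids) > 1:
        keep, rest = ids[0], ids[1:]
        survivors = []
        for doc_id in rest:
            sim = np.count_nonzero(mh[keep] == mh[doc_id]) / len(mh[keep])
            (removed if sim >= threshold else survivors).append(doc_id)
        ids = survivors
    return removed

cluster = [
    {"int_id_column": 1, "document_length": 900, "minhashes": [1, 2, 3, 4]},
    {"int_id_column": 2, "document_length": 850, "minhashes": [1, 2, 3, 9]},
    {"int_id_column": 3, "document_length": 100, "minhashes": [7, 8, 9, 9]},
]
print(greedy_duplicates(cluster))  # [2]: near-identical to doc 1, which is kept
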
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class ClusterAnalysisPythonRuntime(DefaultPythonTransformRuntime): + """ + Cluster analysis runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup ClusterAnalysis + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis python transform") + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py new file mode 100644 index 000000000..aa4aabb90 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -0,0 +1,60 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
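
# A small sketch of the folder fan-out used by ClusterAnalysisPythonRuntime.get_folders above:
# each (band, segment) pair produced by the signature-calculation step becomes one
# independently processable input folder. The counts below are just the documented defaults
# (14 bands, 2 segments in the local example).
import os

num_bands, num_segments = 14, 2
folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(num_bands) for s in range(num_segments)]
print(len(folders))  # 28 folders: band=0/segment=0, band=0/segment=1, band=1/segment=0, ...
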
+################################################################################ + +import os +import sys + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py new file mode 100644 index 000000000..74597068c --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -0,0 +1,179 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
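
# A minimal polars sketch of the three operation modes offered by the DataCleaningTransform
# defined below: an anti-join drops duplicates, an inner join keeps only duplicates, and a
# left join annotates them. The tiny frames are illustrative; the real transform reads the
# consolidated duplicate list from parquet.
import polars as pl

data = pl.DataFrame({"int_id_column": [1, 2, 3, 4], "contents": ["a", "b", "a'", "c"]})
dupes = pl.DataFrame({"int_id_column": [3]})

filtered = data.join(dupes, on="int_id_column", how="anti")     # filter_duplicates
only_dupes = data.join(dupes, on="int_id_column", how="inner")  # filter_non_duplicates
annotated = data.join(
    dupes.with_columns(pl.lit("d").alias("duplicate")), on="int_id_column", how="left"
).with_columns(pl.col("duplicate").fill_null(""))               # annotate
print(filtered["int_id_column"].to_list())  # [1, 2, 4]
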
+################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.data_access import DataAccessFactory +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger + + +short_name = "fdclean" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +duplicate_list_location_key = "duplicate_list_location" +""" This key holds the location of the list of duplicate documents marked for removal""" +operation_mode_key = "operation_mode" +""" This key holds the operation mode: 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" +""" Location of the list of duplicate documents marked for removal""" +operation_mode_cli_param = f"{cli_prefix}{operation_mode_key}" +""" Operation mode, can be one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" + +captured_arg_keys = [ + document_id_column_key, + duplicate_list_location_key, + operation_mode_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +duplicate_list_location_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default location of the list of duplicate documents marked for removal""" +operation_mode_default = "filter_duplicates" +""" Default value for operation mode, will filter out all the duplicate documents""" + +dataclean_data_factory_key = "dc_data_factory" +dataclean_data_access_key = "dc_data_access" + + +class DataCleaningTransform(AbstractTableTransform): + """ + This is the third transform of the fuzzy dedup pipeline. It takes as input + the list of the documents to remove (identified as duplicates during the + cluster analysis phase, and the original dataset. Each dataset file is + imported into a table, and the documents that are in the documents to remove + list are filtered out from that table. The output is a new dataset, which + keeps the directory structure of the input dataset, but has all the fuzzy + duplicates removed. + + Args: + duplicate_location: location (local or s3) of the duplicate document list + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.logger = get_logger(__name__) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default) + self.operation_mode = config.get(operation_mode_key, operation_mode_default) + contents = config.get("df") + self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) + self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + input_df = pl.from_arrow(table) + # handle the case when the doc_id columns in the input dataframe and the + # docs_to_remove_df have different types, i.e. one is int32 and the + # other is int64 + input_doc_id_type = input_df[self.document_id_column].dtype + if input_doc_id_type != self.docs_to_remove_df[self.document_id_column].dtype: + self.docs_to_remove_df = self.docs_to_remove_df.select( + pl.col(self.document_id_column).cast(input_doc_id_type) + ) + if self.operation_mode == "filter_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + elif self.operation_mode == "filter_non_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="inner") + else: # self.operation_mode == "annotation" + duplicates_df = self.docs_to_remove_df.with_columns(pl.lit("d").alias("duplicate")) + result_df = input_df.join(duplicates_df, on=self.document_id_column, how="left").with_columns( + pl.col("duplicate").fill_null("") + ) + result_table = result_df.to_arrow() + metadata = { + "input_files": 1, + "input_docs": table.num_rows, + "input_bytes": table.nbytes, + "output_files": 1, + "output_docs": result_table.num_rows, + "output_bytes": result_table.nbytes, + "filtered_docs": (table.num_rows - result_table.num_rows), + "filtered_bytes": (table.nbytes - result_table.nbytes), + } + return [result_table], metadata + + +class DataCleaningTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningTransform): + super().__init__( + name=short_name, + transform_class=transform_class, + remove_from_metadata=[dataclean_data_factory_key], + ) + self.daf = DataAccessFactory(cli_arg_prefix="dcdata_") + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{duplicate_list_location_cli_param}", + type=str, + default=duplicate_list_location_default, + help="location of duplicate document list that are marked for removal", + ) + parser.add_argument( + f"--{operation_mode_cli_param}", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + default=operation_mode_default, + help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + self.daf.add_input_params(parser=parser) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + self.params[dataclean_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py new file mode 100644 index 000000000..edef8b9c5 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -0,0 +1,103 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + DefaultPythonTransformRuntime, + PythonTransformRuntimeConfiguration, +) +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class DataCleaningPythonRuntime(DefaultPythonTransformRuntime): + """ + Data cleaning runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. 
This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for fuzzy dedup data cleaning step + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param: transform_configuration - transform configuration class + :param: runtime_class - name of the runtime configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(DataCleaningTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py new file mode 100644 index 000000000..b77f44401 --- /dev/null +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -0,0 +1,240 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
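
# A standalone sketch of the path arithmetic in DataCleaningPythonRuntime.get_transform_config
# above: a relative duplicate_list_location is resolved against the parent of the output
# folder, so the default "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
# lands next to the cleaned output. The paths below are made up for illustration.
def resolve_duplicate_list(output_folder: str, duplicate_list_location: str) -> str:
    if duplicate_list_location.startswith("/"):
        return duplicate_list_location
    out_paths = output_folder.rstrip("/").split("/")
    dupl_list_paths = duplicate_list_location.split("/")
    location = "/".join([p.strip("/") for p in out_paths[:-1] + dupl_list_paths])
    if location.startswith("s3://"):
        _, location = location.split("://")
    return location

print(resolve_duplicate_list("/tmp/fdedup/cleaned",
                             "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"))
# /tmp/fdedup/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
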
+################################################################################ + +import argparse +import ast +import os +import sys + +import cluster_analysis_transform +import data_cleaning_transform +import get_duplicate_list_transform +import signature_calc_transform +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils, get_logger, str2bool +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +SERVICE_DICT = { + "SignatureCalculation": "minhash", + "ClusterAnalysis": "cluster", + "GetDuplicateList": "fdlist", + "DataCleaning": "fdclean", +} + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +ARGS_MAP = { + "minhash": signature_calc_transform.captured_arg_keys, + "cluster": cluster_analysis_transform.captured_arg_keys, + "fdlist": get_duplicate_list_transform.captured_arg_keys, + "fdclean": data_cleaning_transform.captured_arg_keys, +} + + +class ServiceOrchestrator: + def __init__(self, global_params: argparse.Namespace = None): + self.global_params = global_params + self.logger = get_logger(__name__) + + def orchestrate(self): + service_list = self.global_params.services.split(",") + for service in service_list: + self.logger.info(f"Starting {service} step") + if service not in SERVICE_DICT: + err_msg = f"Unknown service {service} specified. 
Must be one of {SERVICE_DICT.keys()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + service_short_name = SERVICE_DICT[service] + service_params = self.get_arguments(self.global_params, service_short_name) + self.logger.info(f"Got parameters for {service}") + status = self.execute_service(service_short_name, service_params) + if status == 0: + self.logger.info(f"{service} completed successfully") + else: + self.logger.error(f"{service} failed with status {status}, aborting ...") + break + + def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: + sys_argv = ["python"] + in_args_dict = vars(in_args) + all_module_arguments = ARGS_MAP.get(service_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + for k, v in passed_args.items(): + sys_argv.append(f"--{service_name}_{k}") + sys_argv.append(str(v)) + if service_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif service_name == "fdlist": + input_folder = in_args_dict["output_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "fdclean": + input_folder = in_args_dict["input_folder"] + operation_mode = in_args_dict.get("operation_mode", "filter_duplicates") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + output_folder = os.path.join(in_args_dict["output_folder"], output_subfolder) + else: + self.logger.error(f"Unknown service name: {service_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + if in_args.s3_cred is not None: + s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) + sys_argv.append("--data_s3_cred") + sys_argv.append(s3_cred_ast) + elif ( + s3_creds.get("access_key") is not None + and s3_creds.get("secret_key") is not None + and s3_creds.get("url") is not None + ): + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + elif service_short_name == "cluster": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + else: + err_msg = f"Unknown service {service_short_name} specified. 
Must be one of {SERVICE_DICT.values()}"
+            self.logger.error(err_msg)
+            raise ValueError(err_msg)
+        status = launcher.launch()
+        return status
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Service Orchestrator")
+
+    # Define command line arguments
+    parser.add_argument("--input_folder", type=str, required=True, help="Input folder path")
+    parser.add_argument("--output_folder", type=str, required=True, help="Output folder path")
+
+    parser.add_argument(
+        "--operation_mode",
+        choices=["filter_duplicates", "filter_non_duplicates", "annotate"],
+        required=False,
+        help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents",
+    )
+    parser.add_argument(
+        "--contents_column", type=str, required=False, help="name of the column that stores document text"
+    )
+    parser.add_argument(
+        "--document_id_column", type=str, required=False, help="name of the column that stores the unique document id"
+    )
+    parser.add_argument(
+        "--seed", type=int, required=False, help="seed used to instantiate the random number generator"
+    )
+    parser.add_argument(
+        "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation"
+    )
+    parser.add_argument(
+        "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation"
+    )
+    parser.add_argument(
+        "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band"
+    )
+    parser.add_argument(
+        "--word_shingle_size", type=int, required=False, help="number of words included in one shingle"
+    )
+    parser.add_argument(
+        "--jaccard_similarity_threshold",
+        type=float,
+        required=False,
+        help="jaccard similarity threshold above which two documents are similar",
+    )
+    parser.add_argument(
+        "--num_segments",
+        type=int,
+        required=False,
+        help="the number of segments dividing the hashing space for each band (for scalability)",
+    )
+    parser.add_argument(
+        "--duplicate_list_location",
+        type=str,
+        required=False,
+        help="path to the file with all the duplicate document ids",
+    )
+
+    # Single argument for service execution
+    parser.add_argument(
+        "--services",
+        type=str,
+        required=False,
+        default="SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning",
+        help="Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning)",
+    )
+
+    parser.add_argument(
+        "--use_s3",
+        type=lambda x: bool(str2bool(x)),
+        default=False,
+        help="use s3",
+    )
+
+    parser.add_argument(
+        "--s3_cred",
+        type=ast.literal_eval,
+        default=None,
+        help="ast string of options for s3 credentials",
+    )
+    parser.add_argument(
+        "--shingle_option",
+        type=str,
+        required=False,
+        default="word",
+        help="Option used for shingling",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+
+    # Parse command line arguments
+    args = parse_args()
+    # Initialize the orchestrator
+    orchestrator = ServiceOrchestrator(global_params=args)
+    # Launch python fuzzy dedup execution
+    orchestrator.orchestrate()
diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py
new file mode 100644
index 000000000..c49124cf1
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py
@@ -0,0 +1,184 @@
+# (C) Copyright IBM Corp. 2024.
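
# A toy sketch of how ServiceOrchestrator.get_arguments above maps the shared CLI options onto
# per-service argv entries: every recognized key is re-emitted as --<service_prefix>_<key>.
# The option names below are only a subset, for illustration.
def to_service_argv(service_prefix: str, options: dict, captured_keys: list[str]) -> list[str]:
    argv = ["python"]
    for key, value in options.items():
        if key in captured_keys and value is not None:
            argv += [f"--{service_prefix}_{key}", str(value)]
    return argv

print(to_service_argv("minhash", {"num_permutations": 112, "num_bands": 14, "seed": None},
                      ["num_permutations", "num_bands", "seed"]))
# ['python', '--minhash_num_permutations', '112', '--minhash_num_bands', '14']
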
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +import re +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from Murmur_MH import Murmur_MH + + +short_name = "fdlist" +cli_prefix = f"{short_name}_" + +# configuration keys +subfolder_key = "docs_to_remove" +""" This key holds the name of the subfolder with the duplicate records""" +consolidated_filename_key = "consolidated_filename" +""" This key holds the name of the file with the consolidated list of duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" + +# command line arguments +subfolder_cli_param = f"{cli_prefix}{subfolder_key}" +""" The name of the subfolder with the duplicate records""" +consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" +""" The name of the file with the consolidated list of duplicates""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" + +captured_arg_keys = [ + subfolder_key, + consolidated_filename_key, + sort_output_key, +] + +# defaults +subfolder_default = "docs_to_remove" +""" Default name of the subfolder with the duplicate records""" +consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default name of the file with the consolidated list of duplicates""" +sort_output_default = False + + +class GetDuplicateListTransform(AbstractFolderTransform): + """ + This is an intermediate step of the fuzzy dedup pipeline. It runs in a single + location and consolidates in a single file all the duplicates found for each + band segment. + Args: + subfolder: name of the subfolder with the duplicate records + consolidated_filename: name of the file with the consolidated list of duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.subfolder = config.get(subfolder_key, subfolder_default) + self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.sort_output = config.get(sort_output_key, sort_output_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Get Duplicate List for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, self.consolidated_filename) + + # consolidate into a single data frame band hashes computed by workers + consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") + metadata |= consolidation_stats + output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + consolidated_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + consolidated_dataframe = consolidated_dataframe.vstack(df) + consolidated_dataframe = consolidated_dataframe.select("docs_to_remove").unique() + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, + "consolidated_rows": len(consolidated_dataframe), + } + if self.sort_output: + consolidated_dataframe = consolidated_dataframe.sort(by="docs_to_remove") + + return consolidated_dataframe, consolidation_stats + + +class GetDuplicateListTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=GetDuplicateListTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the GetDuplicateListTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{subfolder_cli_param}", + type=str, + default=subfolder_default, + help="The name of the subfolder with the duplicate records", + ) + parser.add_argument( + f"--{consolidated_filename_cli_param}", + type=str, + default=consolidated_filename_default, + help="The name of the file with the consolidated list of duplicates", + ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py new file mode 100644 index 000000000..34b18ab04 --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "cluster_analysis") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} + +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py new file mode 100644 index 000000000..703ef630e --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py @@ -0,0 +1,71 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
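
# A small polars sketch of the consolidation step implemented above in
# GetDuplicateListTransform.consolidate_docs_to_remove_files: the per-band/segment duplicate
# lists are stacked and reduced to a unique, optionally sorted set of document ids. The
# in-memory frames stand in for the per-segment parquet files; pl.concat is used here for
# brevity where the transform vstacks incrementally.
import polars as pl

segment_outputs = [
    pl.DataFrame({"docs_to_remove": [3, 7, 7]}),
    pl.DataFrame({"docs_to_remove": [7, 11]}),
]
consolidated = pl.concat(segment_outputs).select("docs_to_remove").unique().sort("docs_to_remove")
print(consolidated["docs_to_remove"].to_list())  # [3, 7, 11]
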
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListPythonRuntime(DefaultPythonTransformRuntime): + """ + Get duplicate list runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup GetDuplicateList + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list python transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py new file mode 100644 index 000000000..be395ed4d --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -0,0 +1,51 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
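
# A minimal sketch of word shingling as used by the signature calculation below: each document
# is turned into overlapping word n-grams (default window of 5 words) whose hashes feed the
# minhash computation. Tokenization here is naive whitespace splitting; the real transform
# also normalizes punctuation/whitespace and supports a character-based option.
def word_shingles(text: str, window_size: int = 5) -> list[str]:
    words = text.lower().split()
    if len(words) <= window_size:
        return [" ".join(words)]
    return [" ".join(words[i : i + window_size]) for i in range(len(words) - window_size + 1)]

print(word_shingles("the quick brown fox jumps over the lazy dog", window_size=5))
# ['the quick brown fox jumps', 'quick brown fox jumps over', ...]
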
+################################################################################ + +import os +import sys +from ast import Param + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + + # create launcher + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py new file mode 100644 index 000000000..6b14e1ba0 --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -0,0 +1,519 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
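
# A back-of-the-envelope sketch of the banding defaults defined below: 112 permutations are
# split into 14 bands of 8 minhashes each, and two documents become candidates if they agree
# on every minhash of at least one band. The standard LSH estimate for that probability is
# 1 - (1 - s**r)**b for Jaccard similarity s, r rows per band, and b bands.
b, r = 14, 8
for s in (0.6, 0.7, 0.75, 0.8, 0.9):
    p = 1 - (1 - s**r) ** b
    print(f"similarity {s:.2f} -> candidate probability {p:.3f}")
# The curve rises steeply around the 0.75 default threshold, so most true near-duplicates
# collide in some band while clearly dissimilar pairs are much less likely to.
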
+################################################################################ +import os +import re +import unicodedata +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any, List + +import mmh3 +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.data_access import DataAccessFactory +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from Murmur_MH import Murmur_MH + + +short_name = "minhash" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +contents_column_key = "contents_column" +""" This key holds the name of the column storing the contents of each document""" +seed_key = "seed" +""" This key holds the seed used to instantiate the random number generator""" +num_permutations_key = "num_permutations" +""" This key holds the number of permutations that determine how many minhashes to calculate for each document""" +num_bands_key = "num_bands" +""" This key holds the number of bands to use in the banding technique""" +num_minhashes_per_band_key = "num_minhashes_per_band" +""" This key holds the number of minhashes to use in each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_key = "word_shingle_size" +""" This key holds the size of the word shingles calculated for each document""" +num_segments_key = "num_segments" +""" This key holds the number of segments across which we divide the hashing space for each band""" +shingle_option_key = "shingle_option" +""" This key holds the option that is used to do shingles calculation for each document""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +contents_column_cli_param = f"{cli_prefix}{contents_column_key}" +""" Name of the column storing the contents of each document""" +seed_cli_param = f"{cli_prefix}{seed_key}" +""" The seed used to instantiate the random number generator""" +num_permutations_cli_param = f"{cli_prefix}{num_permutations_key}" +""" Number of permutations that determine how many minhashes to calculate for each document""" +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands to use in the banding technique""" +num_minhashes_per_band_cli_param = f"{cli_prefix}{num_minhashes_per_band_key}" +""" The number of minhashes to use in each band""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_cli_param = f"{cli_prefix}{word_shingle_size_key}" +""" The size of the word shingles calculated for each document""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments across which we divide the hashing space for each band""" +shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" +""" The option (word/char) used to do shingles calculation for each document""" + +captured_arg_keys = [ + document_id_column_key, + contents_column_key, + seed_key, + num_bands_key, + num_minhashes_per_band_key, + num_permutations_key, + jaccard_similarity_threshold_key, + 
word_shingle_size_key, + num_segments_key, + shingle_option_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +contents_column_default = "contents" +""" Default name of the column storing the contents of each document""" +seed_default = 42 +""" Default seed used to instantiate the random number generator""" +num_permutations_default = 112 +""" Default number of minhashes used for each document (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_bands_default = 14 +""" Default number of bands to use in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_minhashes_per_band_default = 8 +""" Default number of minhashes to use in each band (from FineWeb https://arxiv.org/pdf/2406.17557)""" +word_shingle_size_default = 5 +""" Default size of the word shingles (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments across which we divide the hashing space for each band""" +shingle_option_default = "word" +""" Default option of doing shingling""" + + +sigcalc_data_factory_key = "sc_data_factory" +sigcalc_data_access_key = "sc_data_access" + + +NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +WHITESPACE_PATTERN = re.compile(r"\s+") +PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( + map( + chr, + (x for a, b in ((0, 9), (11, 13), (13, 32), (127, 160)) for x in range(a, b)), + ) +) +PUNCTUATION_SET = set(PUNCTUATION) +PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) + + +class SignatureCalculationTransform(AbstractTableTransform): + """ + This is the first transform of the fuzzy dedup pipeline. First, it calculates, + for each document in a dataset, `num_permutations` minhashes. It accepts as + input the number of bands and the length of each band. If those two parameters + are not specified, then, based on the values of `jaccard_similarity_threshold` + and `num_permutations`, it determines the optimal number of bands, and the + length of each band (how many minhashes will be used to get the signature for + each band). The band signatures, the minhashes and the document lengths are + then saved in the output folder, under a folder structure `bands/band=b/segment=s`. + To improve scalability of the next step of fuzzy dedup, the hash space of + each band is divided into `num_segments` segments. + + Args: + document_id_column: name of the column storing the unique ID assigned to each document + contents_column_cli_param: name of the column storing the contents of each document + seed: the seed used to instantiate the random number generator + num_permutations: number of minhashes to calculate for each document + num_bands: number of bands to use for banding technique + num_minhashes_per_band: number of minhashes to use in each band + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + word_shingle_size: the size of the word shingles calculated for each document + num_segments: the number of segments across which we divide the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. 
+        This is generally called with configuration parsed from the CLI arguments defined
+        by the companion configuration class, SignatureCalculationTransformConfiguration.
+        When running under a launcher, these values are supplied by the launcher's runtime.
+        """
+        super().__init__(config)
+        self.document_id_column = config.get(document_id_column_key, document_id_column_default)
+        self.contents_column = config.get(contents_column_key, contents_column_default)
+        self.seed = config.get(seed_key, seed_default)
+        self.num_permutations = config.get(num_permutations_key, num_permutations_default)
+        self.jaccard_similarity_threshold = config.get(
+            jaccard_similarity_threshold_key, jaccard_similarity_threshold_default
+        )
+        self.word_shingle_size = config.get(word_shingle_size_key, word_shingle_size_default)
+        self.num_segments = config.get(num_segments_key, num_segments_default)
+        self.num_bands = config.get(num_bands_key, num_bands_default)
+        self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        self.shingle_option = config.get(shingle_option_key, shingle_option_default)
+        # use this dataframe to store the minhashes and size for each document
+        self.all_minhashes: pl.DataFrame = None
+        # use this dataframe to store the band hashes for each document
+        self.all_band_hashes: pl.DataFrame = None
+        # this variable keeps track of how many files were processed since last
+        # data write to properly update metadata
+        self.files_processed = 0
+        self.bytes_processed = 0
+        self.data_access = config.get("data_access")
+        self.last_file_name = None
+        self.sc_data_access = config.get(sigcalc_data_access_key, None)
+        if self.sc_data_access is None:
+            self.sc_daf = config.get(sigcalc_data_factory_key, None)
+            if self.sc_daf is None:
+                raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}")
+            self.sc_data_access = self.sc_daf.create_data_access()
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        Convert one input Table into 0 or more output tables, together with a dictionary of
+        execution statistics. This implementation computes the minhashes and band signatures
+        for each document in the table and buffers them; the buffered signatures are written
+        to the output folder once enough documents have accumulated, or at flush() time.
+ """ + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug("----minhash---") + self.last_file_name = file_name + self.files_processed += 1 + self.bytes_processed += table.nbytes + # instantiate with same seed so every worker use same hash functions + mm_min_hash = Murmur_MH(num_perm=self.num_permutations, seed=self.seed) + + # load the data from pyarrow table + df = pl.from_arrow(table) + # read the target columns + df = df.select(self.contents_column, self.document_id_column) + + # generate minhash values + minhashes = df.map_rows( + lambda row: mm_min_hash.minhash2_nosalt( + *self._generate_word_shingles(row, self.shingle_option, window_size=self.word_shingle_size) + ) + ) + # rename columns, cast minhashes to list(uint32) + minhashes = minhashes.select( + pl.col("column_2").alias(self.document_id_column), + pl.col("column_0").cast(pl.List(pl.UInt32)).alias("minhashes"), + pl.col("column_1").alias("document_length"), + ) + # store the minhash calculations to send out at the end of execution + if self.all_minhashes is None: + self.all_minhashes = minhashes + else: + self.all_minhashes = self.all_minhashes.vstack(minhashes) + + # Calculate band hashes + band_hashes_list = self.process_rows_into_bands( + minhashes, + self.num_bands, + self.num_rows, + ) + band_hash_schema = pl.Schema( + { + "band_hash": pl.UInt64, + "band_index": pl.Int32, + self.document_id_column: pl.Int64, + } + ) + band_hashes = pl.DataFrame(band_hashes_list, schema=band_hash_schema) + + # store the band hash calculations to send out at the end of execution + if self.all_band_hashes is None: + self.all_band_hashes = band_hashes + else: + self.all_band_hashes = self.all_band_hashes.vstack(band_hashes) + + if len(self.all_minhashes) > 750000: + tables, metadata = self.write_band_signatures() + else: + tables = [] + metadata = {} + # update metadata stats and return the stats (no tables are returned in transform) + return tables, metadata + + def flush(self) -> tuple[list[pa.Table], dict[str, Any]]: + """ + This is supporting method for transformers, that implement buffering of tables, for example coalesce. + These transformers can have buffers containing tables that were not written to the output. Flush is + the hook for them to return back locally stored tables and their statistics. The majority of transformers + should use default implementation. + If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray. 
+ :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be + propagated to metadata + """ + self.logger.info(f"Starting flush()") + if self.all_band_hashes is not None and self.all_minhashes is not None: + tables, metadata = self.write_band_signatures() + else: + tables = [] + metadata = {} + return tables, metadata + + def write_band_signatures(self): + # define the upper and lower bounds of each band segment + segment_bounds_list = [] + upper_bound = np.uint64(np.iinfo(np.uint64).max) + segment_len = np.uint64(upper_bound // self.num_segments) + for segment_index in range(self.num_segments): + segment_bounds_list.append(np.uint64(segment_index) * segment_len) + segment_bounds_list.append(upper_bound) + segment_bounds = np.array(segment_bounds_list, dtype=np.uint64) + self.logger.debug(f"Calculated {len(segment_bounds)} segment_bounds") + # output stats for the metadata + num_tables_written = 0 + num_docs_written = 0 + num_bytes_written = 0 + self.logger.debug(f"dataframe self.all_band_hashes has {len(self.all_band_hashes)} rows") + self.logger.debug(f"dataframe self.all_minhashes has {len(self.all_minhashes)} rows") + # iterate through the bands, get the band hashes for each band, divide + # them into segments, join with minhashes, and upload to storage + for band_ix in range(self.num_bands): + # Filtering on, then dropping the `band_index` column + band_df = self.all_band_hashes.filter(pl.col("band_index") == band_ix).drop("band_index") + # assign each band hash to a segment of the hashing space + self.logger.debug(f"band {band_ix} band_df has {len(band_df)} rows") + for segment_index in range(self.num_segments): + segment_band_df = band_df.filter( + (pl.col("band_hash") > segment_bounds[segment_index]) + & (pl.col("band_hash") <= segment_bounds[segment_index + 1]) + ) + self.logger.debug( + f"band {band_ix} segment {segment_index} segment_band_df has {len(segment_band_df)} rows" + ) + # join the band hash dataframe with the minihash and doc length dataframe + segment_band_minhash_df = segment_band_df.join( + self.all_minhashes, + on=self.document_id_column, + how="inner", + ) + self.logger.debug(f"band {band_ix} segment {segment_index} joined segment_band_df and minhashes") + + # encapsulate document info in a structure + segment_band_minhash_df = segment_band_minhash_df.select( + pl.col("band_hash"), + pl.struct( + [ + pl.col(self.document_id_column), + pl.col("minhashes"), + pl.col("document_length"), + ] + ).alias("document_data"), + ) + self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") + + # append the table to the result list, and the path to metadata + common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) + last_file_name_path = Path(self.last_file_name) + suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + if self.sc_data_access.output_folder is None: + self.sc_data_access.output_folder = self.data_access.output_folder + save_path = os.path.join( + self.sc_data_access.output_folder, + "bands", + f"band={band_ix}", + f"segment={segment_index}", + suffix_path, + ) + segment_band_minhash_table = segment_band_minhash_df.to_arrow() + bytes_written, _, _ = self.sc_data_access.save_table(save_path, segment_band_minhash_table) + if bytes_written > 0: + num_tables_written += 1 + num_docs_written += segment_band_minhash_table.num_rows + num_bytes_written += bytes_written + self.logger.debug(f"Uploaded table for band {band_ix} and 
segment {segment_index}") + # add the stats to metadata + metadata = { + "input_files": self.files_processed, + "input_docs": len(self.all_minhashes), + "input_bytes": self.bytes_processed, + "output_files": num_tables_written, + "output_docs": num_docs_written, + "output_bytes": num_bytes_written, + } + self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes") + self.files_processed = 0 + self.bytes_processed = 0 + self.all_minhashes = None + self.all_band_hashes = None + return [], metadata + + # define shingles generation function + def _generate_word_shingles( + self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " " + ) -> tuple[list, int, int]: + text = row[0] + # lower case + text = text.lower() + # replace numbers with '0' + text = NUMBERS_PATTERN.sub("0", text) + # convert punctuation to spaces + text = text.translate(PUNCTUATION_TRANS) + # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end + text = WHITESPACE_PATTERN.sub(" ", text.strip()) + # diacritics/unicode normalization + text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") + text = text.strip() + self.logger.debug(shingling_option) + if shingling_option == "char": + words = list(text) + else: + words = text.split() + document_id = row[1] + doc_len = len(row[0]) + word_count = len(words) + k_shingles = [] + for i in range(0, max(1, word_count - window_size + 1)): + k_shingles.append(delimiter.join(words[i : i + window_size])) + return k_shingles, doc_len, document_id + + def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + num_minhashes = len(minhashes) + assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" + results = [] + for band_index in range(b): + band_hash, _ = mmh3.hash64( + minhashes[band_index * r : (band_index + 1) * r], + seed=seed, + signed=False, + ) + results.append((band_hash, band_index, int_id_column)) + return results + + # Apply the function + def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + result = [] + for row in df.iter_rows(): + bands = self.emit_bands( + row[0], # document id + np.array(row[1], dtype=np.uint32), # minhashes + row[2], # document length + minhashlsh_num_bands, + minhashlsh_length_band, + ) + for band in bands: + result.append(band) + return result + + +class SignatureCalculationTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=SignatureCalculationTransform, + remove_from_metadata=[sigcalc_data_factory_key], + ) + self.daf = DataAccessFactory(cli_arg_prefix="scdata_") + + from data_processing.utils import get_logger + + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
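+        As an illustration (the values shown are this transform's documented defaults, given only
+        as an example), the parameters defined here surface on the command line with the "minhash_" prefix:
+            --minhash_num_permutations 112 --minhash_num_bands 14 --minhash_num_minhashes_per_band 8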
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{contents_column_cli_param}", + type=str, + default=contents_column_default, + help="name of the column storing the contents of each document", + ) + parser.add_argument( + f"--{seed_cli_param}", + type=int, + default=seed_default, + help="the seed used to instantiate the random number generator", + ) + parser.add_argument( + f"--{num_permutations_cli_param}", + type=int, + default=num_permutations_default, + help="number of permutations (minhashes) calculated for each document", + ) + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + parser.add_argument( + f"--{word_shingle_size_cli_param}", + type=int, + default=word_shingle_size_default, + help="the size of the word shingles calculated for each document", + ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="the number of bands to use in the banding technique", + ) + parser.add_argument( + f"--{num_minhashes_per_band_cli_param}", + type=int, + default=num_minhashes_per_band_default, + help="the number of minhashes to use in each band", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="the number of segments across which we divide the hashing space for each band", + ) + parser.add_argument( + f"--{shingle_option_cli_param}", + type=str, + default=shingle_option_default, + help="Shingling option", + ) + self.daf.add_input_params(parser=parser) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + self.params[sigcalc_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py new file mode 100644 index 000000000..5ddc102eb --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+import time
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.utils import get_logger
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformRuntimeConfiguration for the signature calculation transform,
+    as required by the PythonTransformLauncher.
+    This transform does not use a RayRuntime class, so the superclass only needs the base
+    python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration())
+    logger.info("Launching fdedup signature calculation transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet
new file mode 100644
index 000000000..d67b5bcf8
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet
new file mode 100644
index 000000000..267e78385
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json
new file mode 100644
index 000000000..de47f367b
--- /dev/null
+++ b/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json
@@ -0,0 +1,59 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdclean",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 10:34:04",
+        "end_time": "2024-10-18 10:34:04",
+        "status": "success"
+    },
+    "code": {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path"
+    },
+    "job_input_params": {
+        "document_id_column": "int_id_column",
+        "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "operation_mode": "filter_duplicates",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 96.1,
+        "gpus": 0,
+        "memory": 23.82,
+        "object_store": 0,
+        "execution time, min": 0.006
+    },
+    "job_output_stats": {
+        "source_files": 2,
+        "source_size": 4490,
+        "result_files": 2,
+        "result_size": 18001,
+        "processing_time": 0.341,
+        "input_files": 2,
+        "input_docs": 12,
+        "input_bytes": 8753,
+        "output_files": 2,
+        "output_docs": 4,
+        "output_bytes": 4650,
+        "filtered_docs": 8,
+        "filtered_bytes": 4103,
+        "source_doc_count": 12,
+        "result_doc_count": 4
+    },
+    "source": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input",
+        "type": "path"
+    },
+    "target": {
+        "name":
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 000000000..8e1fe121e Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + "target": { + "name": 
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..edbd80b43 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json new file mode 100644 index 000000000..ba1f5b0a6 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 13:22:42", + "end_time": "2024-10-18 13:22:42", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 32.5, + "gpus": 0, + "memory": 13.31, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.047, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..842ce2caa Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..e67164596 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..8a62a81b2 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:08:23", + "end_time": "2024-10-18 10:08:23", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.8, + "gpus": 0, + "memory": 24.15, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.006, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet new file mode 100644 index 000000000..c9220bf39 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet new file mode 100644 index 000000000..23fac4c72 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet 
differ diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py new file mode 100644 index 000000000..cecd224fe --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + basedir + "/expected/signature_calc/bands", + basedir + "/expected/cluster_analysis/docs_to_remove", + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py new file mode 100644 index 000000000..8c4debed9 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + } + launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py new file mode 100644 index 000000000..9ad8a32d7 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -0,0 +1,40 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")] + return fixtures diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 0b2e9cf1a..4bfe32a9e 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,5 +1,4 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} RUN pip install --upgrade --no-cache-dir pip @@ -14,24 +13,31 @@ COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md -COPY --chown=ray:users images/ images/ +COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
-# copy the main() entry point to the image -COPY ./src/fdedup_transform_ray.py . - -# copy some of the samples in -COPY src/fdedup_local_ray.py local/ +# copy source files needed by test-image +COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py +COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py +COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py +COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ COPY test-data/ test-data/ +USER root +RUN chmod a+rwx /home/ray +USER ray # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index f5f06c3c3..ec193b6c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -43,7 +43,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 923cbdf82..cb8c6306a 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -6,20 +6,16 @@ description = "fdedup Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "mmh3>=4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy>=1.12.0, <2.0.0" + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt new file mode 100644 index 000000000..6ee40ef7f --- /dev/null +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit[ray]==0.2.2.dev2 +dpk_fdedup_transform_python==0.2.2.dev2 +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py new file mode 100644 index 000000000..c54ba85c2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a0e8e7de2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py @@ -0,0 +1,74 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
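Signature calculation writes its band files into a band=<b>/segment=<s> folder hierarchy, which is what the local script above points at (.../signature_calc/bands) and what the get_folders() helper of ClusterAnalysisRayRuntime, defined just below, enumerates. A tiny sketch of that layout, using the band/segment counts from the python tests earlier in this diff:

# Illustrative: the band/segment folder layout consumed by cluster analysis.
# Mirrors ClusterAnalysisRayRuntime.get_folders() below; 14 bands x 2 segments
# are the counts used by the test fixtures.
import os

num_bands, num_segments = 14, 2
folders = [
    os.path.join(f"band={b}", f"segment={s}")
    for b in range(num_bands)
    for s in range(num_segments)
]
print(len(folders))  # 28 folders
print(folders[:3])   # ['band=0/segment=0', 'band=0/segment=1', 'band=1/segment=0']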
+################################################################################ + +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisRayRuntime(DefaultRayTransformRuntime): + """ + Cluster analysis runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/compute_shingles.py b/transforms/universal/fdedup/ray/src/compute_shingles.py deleted file mode 100644 index 2db75ebe2..000000000 --- a/transforms/universal/fdedup/ray/src/compute_shingles.py +++ /dev/null @@ -1,50 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import string - - -""" -This implements the most simplistic splitting of document based on the white spaces -that can be overwritten by a different document splitter (tokenizer). 
This method is -build in the library and can be overwritten using approach described at -https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library - -import compute_shingles -compute_shingles.compute_shingles = my_local_compute_shingles -""" - - -def _find(s: str, ch: str) -> list[int]: - """ - Get indexes of all locations of character in string - :param s: string - :param ch: character - :return: list of locations - """ - return [i for i, ltr in enumerate(s) if ltr == ch] - - -def compute_shingles(txt: str, word_shingle_size: int, delimiter: str = " ") -> list[str]: - """ - Generate word shingles - :param txt: document - :param delimiter: delimiter to split document - :param word_shingle_size: size of shingle in words - :return: list of shingles - """ - text = txt.replace("\n", "").lower().translate(str.maketrans("", "", string.punctuation)) - separators = _find(text, delimiter) - if len(separators) + 1 <= word_shingle_size: - return [text] - bounds = [-1] + separators + [len(text)] - return [text[bounds[i] + 1 : bounds[i + word_shingle_size]] for i in range(0, len(bounds) - word_shingle_size)] diff --git a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py similarity index 59% rename from transforms/universal/fdedup/ray/src/fdedup_local_ray.py rename to transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index af7bec71c..b951e2fc8 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -13,59 +13,57 @@ import os import sys +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) worker_options = {"num_cpus": 0.8} + code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { # where to run "run_locally": True, # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 1, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, } -sys.argv = ParamsUtils.dict_to_req(d=params) -# launch -launcher.launch() + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py new file mode 100644 index 000000000..88171e260 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -0,0 +1,138 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +import ray +from data_cleaning_transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, +) +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from ray.actor import ActorHandle + + +logger = get_logger(__name__) + + +class DataCleaningRayTransform(DataCleaningTransform): + """ """ + + def __init__(self, config: dict): + """ + Initialize based on the dictionary of configuration information. 
+ This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, LangSelectorTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + docs2removedf = config.get("df", None) + if docs2removedf is not None: + # This is recommended for production approach. In this case domain list is build by the + # runtime once, loaded to the object store and can be accessed by actors without additional reads + try: + config["df"] = ray.get(config.get("df")) + except Exception as e: + self.logger.warning(f"Exception loading docs2remove list from ray object storage {e}") + raise RuntimeError(f"exception loading from object storage for key {docs2removedf}") + super().__init__(config) + + +class DataCleaningRuntime(DefaultRayTransformRuntime): + """ + Ingest Data cleaning runtime support + """ + + def __init__(self, params: dict[str, Any]): + """ + Create filter runtime + :param params: parameters, that should include + ingest_supported_langs_file_key: supported languages file + ingest_detect_programming_lang_key: whether to detect programming language + ingest_domain_key: domain + ingest_snapshot_key: snapshot + """ + super().__init__(params) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__) + + def get_transform_config( + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], + ) -> dict[str, Any]: + """ + Set environment for filter execution + :param data_access_factory - data access factory + :param statistics - reference to the statistics object + :param files - list of files to remove + :return: dictionary of filter init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + docs_to_remove_list = ray.put(duplicate_list) + return {"df": docs_to_remove_list} | self.params + + +class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
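The relative-path handling in get_transform_config() above is easy to miss: a duplicate_list_location that does not start with "/" is resolved as a sibling of the output folder (the last path component of the output folder is dropped before joining), and an s3:// scheme prefix is stripped before the file is fetched. A standalone sketch of just that logic, with hypothetical bucket and folder names:

# Illustrative sketch of the duplicate-list path resolution in DataCleaningRuntime above.
# Only the joining logic mirrors the runtime code; the bucket/folder names are hypothetical.
def resolve_duplicate_list_location(output_folder: str, duplicate_list_location: str) -> str:
    if not duplicate_list_location.startswith("/"):
        out_paths = output_folder.rstrip("/").split("/")
        dupl_list_paths = duplicate_list_location.split("/")
        paths = out_paths[:-1] + dupl_list_paths
        duplicate_list_location = "/".join([p.strip("/") for p in paths])
    if duplicate_list_location.startswith("s3://"):
        _, duplicate_list_location = duplicate_list_location.split("://")
    return duplicate_list_location


print(
    resolve_duplicate_list_location(
        "s3://my-bucket/fdedup/cleaned",
        "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
    )
)
# -> my-bucket/fdedup/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet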
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(transform_class=DataCleaningRayTransform), + runtime_class=DataCleaningRuntime, + ) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py b/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py deleted file mode 100644 index 285fcfa22..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py +++ /dev/null @@ -1,76 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} - -s3_conf = { - "input_folder": "test/fdedup/input", - "output_folder": "test/fdedup/output", -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. 
Only required parameters are specified - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 5, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 2, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, -} -sys.argv = ParamsUtils.dict_to_req(d=params) - - -# launch -launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_support.py b/transforms/universal/fdedup/ray/src/fdedup_support.py deleted file mode 100644 index 60afb84bf..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_support.py +++ /dev/null @@ -1,621 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import pickle -import time -from typing import Any, Iterator, Union - -import numpy as np -import ray -from data_processing.data_access import SnapshotUtils -from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger -from data_processing_ray.runtime.ray import RayUtils -from ray.actor import ActorHandle -from ray.util import ActorPool -from scipy.integrate import quad as integrate - - -NO_SIMILARITY = -1 -REQUEST_LEN = 4096 -LONG_BUCKET = 5000 -LONG_BUCKET_PRINT = 1000 - - -def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, -) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - -class MurmurMH: - def __init__(self, num_perm: int, seed: int = RANDOM_SEED): - self.seed = seed - self.num_perm = num_perm - self.permutations = self._init_permutations(seed, num_perm) - - def minhash(self, shingle_count: int, shingles: Iterator[str]) -> np.array: - def generator(): - for shingle in shingles: - yield TransformUtils.str_to_int(shingle) - - hash_values = np.fromiter(generator(), dtype=np.uint64, count=shingle_count) - - result = np.zeros(self.permutations.shape, dtype=np.uint32) - for i, perm in enumerate(self.permutations): - result[i] = np.right_shift((perm * hash_values).T, 32).astype(np.uint32).min(axis=0, keepdims=False) - return result - - @staticmethod - def _init_permutations(seed: int, num_perm: int) -> np.array: - # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic - max_int = np.uint64((1 << 64) - 1) - gen = np.random.RandomState(seed) - # get self.num_perm pseudo random numbers between 2 and max_int (excl) - permutations = np.array([gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], dtype=np.uint64).T - # make all even pseudo random numbers odd by adding 1 - permutations[permutations % 2 == 0] += 1 - return permutations - - @staticmethod - def jaccard(mh1: np.array, mh2: np.array) -> float: - return np.count_nonzero(mh1 == mh2) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocCollector: - """ - An actor 
collecting de duped document IDs - """ - - def __init__(self, params: dict[str, Any]): - """ - Initializer - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - self.removed = set() - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.ids = {} - else: - try: - bids, _ = self.data_access.get_file(snapshot) - self.ids = pickle.loads(bids) - except Exception as e: - self.logger.warning(f"Failed to load doc collector {self.actor_id} with exception {e}") - raise e - - def add_documents(self, dr: tuple[list[tuple[int, int]], list[int]]) -> None: - """ - Add documents and removed document - :param dr: documents to keep and documents to remove - :return: - """ - docs = dr[0] - rm = dr[1] - # process documents to remove - for did in rm: - self.ids.pop(did, None) - self.removed.update(rm) - # process documents to keep - for key, val in docs: - if key in self.removed: - continue - if key in self.ids and val == NO_SIMILARITY: - # Do not update existing docs with NO_SIMILARITY - continue - else: - self.ids[key] = val - - def filter(self, docs: list[int]) -> dict[int, int]: - """ - Filter documents - :param docs: documents to filter - :return: documents to keep - """ - result = {} - for doc_id in docs: - r = self.ids.get(doc_id, None) - if r is not None: - result[doc_id] = r - return result - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.ids) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}docs/doc_collector_{self.actor_id}", b_doc - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float, int, float]: - """ - get sizes - :return: number of ids, its memory utilization, number of removed, its memory utilization - """ - return ( - len(self.ids), - TransformUtils.deep_get_size(self.ids) / GB, - len(self.removed), - TransformUtils.deep_get_size(self.removed) / GB, - ) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocsMinHash: - """ - An actor storing min hashes for a doc id - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialize - :param params: parameters - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.docs = {} - else: - try: - bdocs, _ = self.data_access.get_file(snapshot) - self.docs = pickle.loads(bdocs) - except Exception as e: - self.logger.warning(f"Failed to load minhash collector {self.actor_id} with exception {e}") - raise e - - def add_minhashes(self, updates: list[tuple[int, int, np.array]]) -> None: - """ - Add minhashes - :param updates: minhash for doc_id a tuple of doc len and array of hashes - :return: None - """ - for doc_id, length, minhash in updates: - self.docs[doc_id] = np.concatenate(([length], minhash)) - - def get_minhashes(self, doc_ids: list[int]) -> list[tuple[int, int, np.array]]: - """ - Get minhashes for a list of documents - :param doc_ids: list of doc ids - :return: doc id, len, minhashes - """ - result = [] - for doc_id in doc_ids: - info = self.docs.get(doc_id) - if info is not None: - result.append((doc_id, info[0], info[1:])) - return result - - def snapshot(self) -> None: - 
""" - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.docs) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}minhash/minhash_collector_{self.actor_id}", - b_doc, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot minhash collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get size of used min hashes - :return: number of docs, its memory utilization - """ - return len(self.docs), TransformUtils.deep_get_size(self.docs) / GB - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHash: - """ - Actor storing buckets information - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialization - """ - from ray.util.metrics import Counter - - self.submitter = None - self.n_buckets = 0 - self.bucket_memory = 0 - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.buckets = {} - else: - try: - b_buckets, _ = self.data_access.get_file(snapshot) - self.buckets = pickle.loads(b_buckets) - except Exception as e: - self.logger.warning(f"Failed to load buckets collector {self.actor_id} with exception {e}") - raise e - self.bucket_created_counter = Counter("bucket_created", "Amount of buckets created") - self.long_bucket_submit_counter = Counter("long_bucket_submitted", "Amount of long buckets submitted") - self.short_bucket_submit_counter = Counter("short_bucket_submitted", "Amount of short buckets submitted") - - def add_buckets(self, bck: list[tuple[int, list[int]]]) -> None: - """ - Add additional buckets to hash - :param bck: bucket information - :return: None - """ - for bucket in bck: - b_hash = bucket[0] - buckets_for_hash = self.buckets.get(b_hash) - if buckets_for_hash: - if type(buckets_for_hash) == int: - self.buckets[b_hash] = [buckets_for_hash] + bucket[1] - else: - buckets_for_hash.extend(bucket[1]) - else: - if len(bucket[1]) == 1: - self.buckets[b_hash] = bucket[1][0] - else: - self.buckets[b_hash] = bucket[1] - self.bucket_created_counter.inc(1) - - def add_processing_submitter(self, submitter: ActorHandle) -> None: - """ - Add process submitter - :param submitter: reference to submitter - :return: - """ - self.submitter = submitter - - def process_buckets(self) -> None: - """ - Process buckets to generate documents - :return: None - """ - - # Remember usage - self.n_buckets = len(self.buckets) - self.bucket_memory = TransformUtils.deep_get_size(self.buckets) / GB - - # split buckets into short and long. 
Long buckets can take very long to process - long_buckets = [] - short_buckets = [] - while len(self.buckets) > 0: - doc_id, bucket = self.buckets.popitem() - if type(bucket) == list and len(bucket) > LONG_BUCKET: - # Its long - long_buckets.append(bucket) - else: - short_buckets.append(bucket) - self.logger.info(f"processing buckets {len(long_buckets)} long, {len(short_buckets)} short") - - # process long buckets first - we are submitting them one at a time - for bucket in long_buckets: - if len(bucket) > 2 * LONG_BUCKET: - # For very long buckets, split them - self.logger.info(f"Splitting bucket of length len(bucket) into chunks") - smaller_bucket = [ - bucket[i * LONG_BUCKET : (i + 1) * LONG_BUCKET] - for i in range((len(bucket) + LONG_BUCKET - 1) // LONG_BUCKET) - ] - for b in smaller_bucket: - ray.get(self.submitter.submit_for_processing.remote([b])) - self.long_bucket_submit_counter.inc(1) - else: - ray.get(self.submitter.submit_for_processing.remote([bucket])) - self.long_bucket_submit_counter.inc(1) - self.logger.info("Done submitting long buckets") - - # And now the rest of buckets - bucket_chunks = [short_buckets[i * 100 : (i + 1) * 100] for i in range((len(short_buckets) + 99) // 100)] - for b in bucket_chunks: - ray.get(self.submitter.submit_for_processing.remote(b)) - self.short_bucket_submit_counter.inc(len(b)) - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_buckets = pickle.dumps(self.buckets) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}buckets/buckets_collector_{self.actor_id}", - b_buckets, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot buckets collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get buckets resource utilization - :return: number of buckets and memory utilization - """ - return self.n_buckets, self.bucket_memory - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHashProcessor: - """ - Actor for processing buckets - """ - - def __init__(self, params: dict[str, Any]): - """ - Init method - :param params - dictionary of parameters containing the following keys - remote_docs - handles to the remote docs - remote_minhashes - handles to the remote minhashes - mn_min_hash - MurmurMH class - threshold - threshold - statistics - statistics actor - """ - from ray.util.metrics import Counter - - self.threshold = params["threshold"] - self.mn_min_hash = params["mn_min_hash"] - self.remote_docs = params["remote_docs"] - self.remote_minhashes = params["remote_minhashes"] - self.stats = params["statistics"] - self.logger = get_logger(__name__) - self.bucket_processed_counter = Counter("bucket_processed", "Amount of buckets processed") - - def _submit_generated_docs(self, docs: dict[int, int], removed: set[int]) -> None: - """ - Submit generated documents - :param docs: docs to submit - :param removed: removed documents - :return: None - """ - # Remove doc ids that are already removed - for did in removed: - docs.pop(did, None) - # Build remote requests - request = [([], []) for _ in range(len(self.remote_docs))] - for key, value in docs.items(): - req_tuple = request[key % len(self.remote_docs)] - req_tuple[0].append((key, value)) - for did in removed: - req_tuple = request[did % len(self.remote_docs)] - req_tuple[1].append(did) - # Submit requests and wait for replies - remote_replies = [] - i = 0 - for req in request: - if len(req[0]) > 0 or len(req[1]) > 0: # Only submit if the request has data - 
remote_replies.append(self.remote_docs[i].add_documents.remote(req)) - i += 1 - # Process replies - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - # get minhashes and length for docs in the bucket - def _get_minhashes_docs(self, doc_ids: list[int]) -> dict[int, tuple[int, list[int]]]: - """ - Get minhashes for documents by submitting requests to an appropriate doc collectors - :param doc_ids: doc ids - :return: doc ids with hashes - """ - request = [[] for _ in range(len(self.remote_minhashes))] - for value in doc_ids: - request[value % len(self.remote_minhashes)].append(value) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.remote_minhashes[i].get_minhashes.remote(req)) - i += 1 - # Process replies - hashes = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - for r in reply: - hashes[r[0]] = (r[1], r[2]) - remote_replies = not_ready - return hashes - - def process_buckets(self, buckets: list[Union[int, list[int]]]) -> None: - """ - process buckets to generate documents - :param buckets: buckets - :return: none - """ - t_start = time.time() - docs = {} - removed = set() - for bucket in buckets: - if type(bucket) == int: - # This hash has a single document - if bucket not in docs: - docs[bucket] = NO_SIMILARITY - self.bucket_processed_counter.inc(1) - continue - # multiple documents - start = time.time() - bucket_len = len(bucket) - very_long = bucket_len > LONG_BUCKET - - hashes = self._get_minhashes_docs(bucket) - set_list = [] - unvisited = set(bucket) - - # combine similar documents - index = 0 - while len(unvisited) > 0: - current_doc_id = unvisited.pop() - current_mh = hashes[current_doc_id][1] - current_set = set() - for other_doc_id in bucket: - if other_doc_id in unvisited: - other_mh = hashes[other_doc_id][1] - if self.mn_min_hash.jaccard(current_mh, other_mh) >= self.threshold: - current_set.add(current_doc_id) - current_set.add(other_doc_id) - unvisited.discard(other_doc_id) - if len(current_set) > 0: - set_list.append(current_set) - index += 1 - if index % LONG_BUCKET_PRINT == 0: - self.logger.info(f"processing very long {bucket_len} bucket, {index} documents so far") - if index > LONG_BUCKET_PRINT: - self.logger.info(f"done processing very long {bucket_len}") - - # process created sets - for current_set in set_list: - for d in current_set: - bucket.remove(d) - removed.update(current_set) - for i, doc_id in enumerate(current_set): - if i == 0: - cluster_id = doc_id - remaining = doc_id - min_len = hashes[doc_id][0] - max_len = min_len - continue - c_len = hashes[doc_id][0] - if c_len > max_len: - max_len = c_len - remaining = doc_id - continue - if c_len <= min_len: - min_len = c_len - cluster_id = doc_id - docs[remaining] = cluster_id - removed.discard(remaining) - - # if we did not find docs in connections, submit them as NO_SIMILARITY - for d in bucket: - if d not in docs: - docs[d] = NO_SIMILARITY - if very_long: - self.logger.info( - f"Processed long ({bucket_len}) bucket in {round((time.time() - start) / 60.,3)} " - f"min; " - f"docs chains {len(set_list)}" - ) - self.bucket_processed_counter.inc(1) - # Submit docs - self._submit_generated_docs(docs, removed) - # peg stats - self.stats.add_stats.remote({"generated doc_ids": len(docs), "bucket processing time": time.time() - t_start}) - - -@ray.remote(scheduling_strategy="SPREAD") -class 
BucketsHashProcessorInvoker(object): - """ - Bucket hash processing coordinator (singleton) - """ - - def __init__(self, bucket_processors: list[ActorHandle]) -> None: - self.n_processors = len(bucket_processors) - self.pool = ActorPool(bucket_processors) - self.submitted = 0 - self.processed = 0 - self.logger = get_logger(__name__) - self.start = time.time() - - def submit_for_processing(self, buckets: list[Union[int, list[int]]]) -> None: - # Get completed results - if self.submitted < self.n_processors: # still have room - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.logger.debug("Submitted bucket processing request") - self.submitted += 1 - return - else: - while True: - # we can have several workers fail here - try: - self.pool.get_next_unordered() - break - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") - self.logger.debug("Completed bucket processing request") - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.submitted += 1 - self.logger.debug("Submitted bucket processing request") - return - - def wait_for_completion(self) -> None: - self.logger.info(f"Waiting bucket processing completion. Submitted requests {self.submitted}") - while self.pool.has_next(): - try: - self.pool.get_next_unordered() - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py index 6c6c02bb3..be1bf5fcb 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py @@ -10,794 +10,67 @@ # limitations under the License. 
################################################################################ -import random -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import mmh3 -import numpy as np -import pyarrow as pa -import ray -from data_processing.data_access import DataAccessFactoryBase, SnapshotUtils -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import ( - RANDOM_SEED, - CLIArgumentProvider, - TransformUtils, - str2bool, -) -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformFileProcessor, - RayTransformLauncher, - RayUtils, +import argparse +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from fdedup_transform_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, ) -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, +from get_duplicate_list_transform_ray import ( + GetDuplicateListRayRuntime, + GetDuplicateListRayTransformConfiguration, ) -from fdedup_support import ( - REQUEST_LEN, - BucketsHash, - BucketsHashProcessor, - BucketsHashProcessorInvoker, - DocCollector, - DocsMinHash, - MurmurMH, - fuzzy_optimal_param, -) -from ray.actor import ActorHandle -from ray.util import ActorPool - - -short_name = "fdedup" -cli_prefix = f"{short_name}_" - - -class FdedupTransform(AbstractTableTransform): - """ - Implements fuzzy dedup data preprocessor (building tables and minhashes). - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. 
- :param config: initialization parameters, with the following keys - doc_column - name of doc column - doc_id_int_column - name of int doc id column - word_shingle_size - word shingle size - mn_min_hash - MurmurMH class - num_bands - number of bands - length_band band length - remote_buckets - bucket actors - remote_minhashes - minhash actors - delimiter - delimiter - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.word_shingle_size = config.get("word_shingle_size", 1) - self.delimiter = config.get("delimiter", " ") - self.mn_min_hash = config.get("mn_min_hash", None) - self.num_bands = config.get("num_bands", 1) - self.length_band = config.get("length_band", 1) - self.buckets = config.get("remote_buckets", []) - self.minhashes = config.get("remote_minhashes", []) - self.random_delay_limit = config.get("random_delay_limit", 10) - - def _generate_minhashes(self, shingles: list[str]) -> np.array: - """ - Generate minhashes - :param shingles: - :return: generated minhashes - """ - min_hashes = self.mn_min_hash.minhash(len(shingles), shingles) - num_min_hashes = len(min_hashes) - assert self.num_bands * self.length_band <= num_min_hashes, ( - f"num_bans*band_len must be <= num min hashes, was num_bands={self.num_bands}, " - f"bands_len={self.length_band}, num_min hashes={num_min_hashes}" - ) - return min_hashes - - def _generate_buckets(self, min_hashes: np.array) -> list[int]: - """ - Generate buckets - :param min_hashes: array of minhashes - :return: - """ - return [ - mmh3.hash64(min_hashes[i * self.length_band : (i + 1) * self.length_band], seed=RANDOM_SEED, signed=False)[ - 0 - ] - for i in range(self.num_bands) - ] - - def _submit_buckets_minhashes( - self, buckets: dict[int, list[int]], minhashes: list[tuple[int, int, np.array]] - ) -> None: - """ - Submit buckets to hash - :param buckets: buckets - :param minhashes: minhashes - :return: None - """ - # bucket requests - request = [[] for _ in range(len(self.buckets))] - for key, value in buckets.items(): - request[key % len(self.buckets)].append((key, value)) - # Submit requests to appropriate bucket collectors - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.buckets[i].add_buckets.remote(req)) - i += 1 - # Minhashes - request = [[] for _ in range(len(self.minhashes))] - for minh in minhashes: - request[minh[0] % len(self.minhashes)].append(minh) - # Submit requests to appropriate minhash collectors - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.minhashes[i].add_minhashes.remote(req)) - i += 1 - # wait for completion - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - Preprocessing table content. 
- :param table: table - :param file_name - name of currently processed file - :return: resulting table, statistics - """ - from compute_shingles import compute_shingles - - def flush(limit: int) -> None: - """ - flushing buckets and minhashes to dedicated actors - :param limit: number of buckets to flush - :return: None - """ - if len(buckets) >= limit: # time to submit - nonlocal num_buckets - nonlocal num_minhashes - self._submit_buckets_minhashes(buckets, minhashes) - num_buckets = num_buckets + len(buckets) - num_minhashes = num_minhashes + len(minhashes) - buckets.clear() - minhashes.clear() - - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # Inner variables - buckets = {} - minhashes = [] - num_buckets = 0 - num_minhashes = 0 - docs = table[self.doc_column] - doc_ids = table[self.doc_id_column] - # for every document/its integer id - for n in range(table.num_rows): - doc = docs[n].as_py() - doc_id = doc_ids[n].as_py() - shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter) - if len(shingles) > 0: - mh = self._generate_minhashes(shingles) - minhashes.append((doc_id, len(doc), mh)) - candidates = self._generate_buckets(mh) - - for b_hash in candidates: - bucket_array = buckets.get(b_hash) - if bucket_array is None: - buckets[b_hash] = [doc_id] - else: - bucket_array.append(doc_id) - flush(REQUEST_LEN) - flush(0) - # peg stats - stats = {"generated buckets": num_buckets, "generated minhashes": num_minhashes} - time.sleep(int(random.random() * self.random_delay_limit)) - return [], stats - - -class FdedupFilter(AbstractTableTransform): - """ - Filtering documents - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - The dictionary should contain the following: - doc_column - name of doc column - doc_id_int_column - name of int doc id column - cluster_column - name of the cluster column - remote_docs - list of remote doc collectors - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.cluster_column = config.get("cluster_column", "") - self.docs = config.get("remote_docs", "") - self.random_delay_limit = config.get("random_delay_limit", 10) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - De duping (filtering) table content. 
- :param table: table - :param file_name: name of the currently processing file - :return: resulting table, statistics - """ - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # inner variables - ids = table.column(self.doc_id_column) - # Submit requests to an appropriate doc collectors - request = [[] for _ in range(len(self.docs))] - for value in ids: - doc_id = value.as_py() - request[doc_id % len(self.docs)].append(doc_id) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.docs[i].filter.remote(req)) - i += 1 - # Process replies - unique = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - unique.update(reply) - remote_replies = not_ready - # Filter out table - mask = [] - clusters = [] - # Actual filtering - for n in range(table.num_rows): - doc_id = ids[n].as_py() - if doc_id in unique: - mask.append(True) - clusters.append(unique.pop(doc_id)) - else: - mask.append(False) - # build out table - out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters) - # build execution statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} - time.sleep(int(random.random() * self.random_delay_limit)) - return [out_table], stats - - -class FdedupRuntime(DefaultRayTransformRuntime): - """ - Fuzzy dedup runtime support. Here we are using set environment to implement first two steps of fuzzy dedup - processing - preprocessing and bucket hash processing - """ - - def __init__(self, params: dict[str, Any]): - """ - Create filter runtime - :param params: parameters, that should include - doc_column - name of the document column - id_column - name of the integer doc id column - cluster_column - name of the cluster column - worker_options - start options for preprocessor - from the orchestrator configuration - bucket_cpu - number of cpus for bucket actor - doc_cpu - number of cpus for doc actor - mhash_cpu - number of cpus for minhash actor - num_doc_actors - number of document actors - num_bucket_actors - number of bucket actors - num_minhash_actors - number of minhash actors - num_preprocessors - number of preprocessors - snapshot_delay - delay (sec) in sending snapshot requests to actors - use_bucket_snapshot - use bucket snapshot - use_doc_snapshot - use doc snapshot - random_delay_limit - random_delay limit - # fuzzy specific parameters - num_permutations - number of permutations - threshold - threshold - world_shingle_size - word shingles size - delimiters - delimiter - """ - from data_processing.utils import get_logger - - super().__init__(params) - self.logger = get_logger(__name__) - self.sum_buckets = 0 - self.sum_buckets_mem = 0 - self.sum_mh = 0 - self.sum_mh_mem = 0 - self.document_collectors = [] - self.snapshot_delay = self.params.get("snapshot_delay", 1) - self.random_delay_limit = self.params.get("random_delay_limit", 10) - - def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> dict[str, Any]: - """ - Set environment for filter execution - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: dictionary of filter init params - """ - if self.params.get("use_doc_snapshot", 
False): - self.logger.info("continuing from the document actors snapshot") - data_access = data_access_factory.create_data_access() - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}docs" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.info(f"Found the following snapshot files {files.keys()}") - self.document_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - self.document_collectors[i] = DocCollector.options( - **{"num_cpus": self.params.get("doc_cpu", 0.5)} - ).remote({"id": i, "data_access": data_access_factory, "snapshot": file}) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(self.document_collectors)} document collectors to continue processing") - else: - self.logger.info("starting run from the beginning") - self._create_doc_actors(data_access_factory=data_access_factory, statistics=statistics, files=files) - return { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "cluster_column": self.params.get("cluster_column", ""), - "remote_docs": self.document_collectors, - "random_delay_limit": self.random_delay_limit, - } - - def _create_doc_actors( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: None - """ - mn_min_hash = MurmurMH(num_perm=self.params.get("num_permutations", 64), seed=RANDOM_SEED) - if self.params.get("use_bucket_snapshot", False): - self.logger.info("continuing from the bucket actors snapshot") - data_access = data_access_factory.create_data_access() - # recreate bucket collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}buckets" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following bucket snapshot files {files.keys()}") - bucket_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(bucket_collectors)} bucket collectors to continue processing") - # recreate minhash collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}minhash" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following minhash snapshot files {files.keys()}") - minhash_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - self.logger.info(f"Created {len(minhash_collectors)} minhash collectors to 
continue processing") - else: - self.logger.info("continuing from the very beginning") - self._create_doc_actors_internal( - data_access_factory=data_access_factory, statistics=statistics, mn_min_hash=mn_min_hash, files=files - ) - - def _create_doc_actors_internal( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - mn_min_hash: MurmurMH, - files: list[str], - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param mn_min_hash - MurmurMH class - :param files - list of files to process - :return: None - """ - # compute fuzzy dedup parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=self.params.get("threshold", 0.8), - num_perm=self.params.get("num_permutations", 64), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - self.logger.info(f"Fuzzy: num buckets {num_buckets}, bucket length {length_bucket}") - # Build bucket and minhash collectors - bucket_collectors = [None] * self.params.get("num_bucket_actors", 1) - for i in range(self.params.get("num_bucket_actors", 1)): - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(bucket_collectors)} bucket actors") - minhash_collectors = [None] * self.params.get("num_minhash_actors", 1) - for i in range(self.params.get("num_minhash_actors", 1)): - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(minhash_collectors)} minhash actors") - self._preprocess_tables( - data_access_factory=data_access_factory, - statistics=statistics, - files=files, - mn_min_hash=mn_min_hash, - num_buckets=num_buckets, - length_bucket=length_bucket, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - random_delay_limit=self.random_delay_limit, - ) - # At this point we can snapshot both bucket and minhash collectors for potential restart - self.logger.info("creating minhash snapshots") - minhash_replies = [None] * len(minhash_collectors) - index = 0 - for collector in minhash_collectors: - minhash_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while minhash_replies: - ready, not_ready = ray.wait(minhash_replies) - minhash_replies = not_ready - self.logger.info("minhash snapshots created") - self.logger.info("creating bucket snapshots") - bucket_replies = [None] * len(bucket_collectors) - index = 0 - for collector in bucket_collectors: - bucket_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while bucket_replies: - ready, not_ready = ray.wait(bucket_replies) - bucket_replies = not_ready - self.logger.info("bucket snapshots created") - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - - def _process_buckets( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - mn_min_hash: MurmurMH, - ) -> None: - """ - Process buckets - :param data_access_factory - data access factory - :param statistics - statistics actor - :param bucket_collectors - bucket collectors - 
:param minhash_collectors - minhash collectors - :param mn_min_hash - MMurmurMH class - :return: None - """ - # Create document collectors - self.document_collectors = [None] * self.params.get("num_doc_actors", 1) - for i in range(self.params.get("num_doc_actors", 1)): - self.document_collectors[i] = DocCollector.options(**{"num_cpus": self.params.get("doc_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(self.document_collectors)} document actors") - # create bucket processors - bucket_processors_list = RayUtils.create_actors( - clazz=BucketsHashProcessor, - params={ - "remote_docs": self.document_collectors, - "remote_minhashes": minhash_collectors, - "mn_min_hash": mn_min_hash, - "threshold": self.params.get("threshold", 0.8) * self.params.get("num_permutations", 64), - "statistics": statistics, - }, - actor_options=self.params.get("worker_options", None), - n_actors=self.params.get("num_preprocessors", 1), - ) - self.logger.info(f"created {len(bucket_processors_list)} bucket processor actors") - # create bucket processors invoker - bucket_processor_invoker = BucketsHashProcessorInvoker.options( - num_cpus=self.params.get("bucket_cpu", 0.5) - ).remote(bucket_processors=bucket_processors_list) - self.logger.info(f"created bucket processor invoker") - # Add invoker to the buckets - bucket_replies = [ - collector.add_processing_submitter.remote(submitter=bucket_processor_invoker) - for collector in bucket_collectors - ] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - self.logger.info(f"added invoker to bucket collectors") - # start bucket processing and wait for completion - start = time.time() - bucket_replies = [collector.process_buckets.remote() for collector in bucket_collectors] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - # Wait for pool to complete - ray.get(bucket_processor_invoker.wait_for_completion.remote()) - self.logger.info(f"Done processing buckets in {round((time.time() - start) / 60.,3)} min") - # At this point we can save doc actors, in case we would want to restart here - self.logger.info(f"creating document snapshots") - doc_replies = [None] * len(self.document_collectors) - index = 0 - for collector in self.document_collectors: - doc_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while doc_replies: - ready, not_ready = ray.wait(doc_replies) - doc_replies = not_ready - self.logger.info(f"document snapshots created") - # At this point we do not need bucket and minhash actors, remove them - # but first get usage information - # Bucket collector - replies = [collector.get_size.remote() for collector in bucket_collectors] - while replies: - ready, not_ready = ray.wait(replies) - b_amount, b_memory = ray.get(ready)[0] - self.sum_buckets += b_amount - self.sum_buckets_mem += b_memory - replies = not_ready - for collector in bucket_collectors: - ray.kill(actor=collector, no_restart=True) - # minhash collector - replies = [collector.get_size.remote() for collector in minhash_collectors] - while replies: - ready, not_ready = ray.wait(replies) - m_amount, m_memory = ray.get(ready)[0] - self.sum_mh += m_amount - self.sum_mh_mem += m_memory - replies = not_ready - for collector in minhash_collectors: - ray.kill(actor=collector, no_restart=True) - # Clean up processors - for processor in bucket_processors_list: - ray.kill(actor=processor, no_restart=True) - ray.kill(bucket_processor_invoker) - - def 
_preprocess_tables( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - files: list[str], - mn_min_hash: MurmurMH, - num_buckets: int, - length_bucket: int, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - random_delay_limit: int, - ) -> None: - """ - Preprocess tables - build, run and cleanup - :param data_access_factory - data access factory - :param statistics - statistics actor - :param files - list of files to process - :param mn_min_hash - MurmurMH class - :param num_buckets - number of buckets - :param length_bucket - bucket length - :param bucket_collectors - bucket collector actors - :param minhash_collectors - minhash_collector actors - :param random_delay_limit - max for random dalay limit - :return: None - """ - from ray.util.metrics import Gauge - - worker_options = self.params.get("worker_options", None) - # Here we are limiting the number of readers not to overwhelm COS - n_readers = self.params.get("num_preprocessors", 1) - if n_readers > 1000: - n_readers = 1000 - self.logger.info(f"Table preprocessing uses {n_readers} readers") - # Create preprocessing actors - processor_params = { - "data_access_factory": data_access_factory, - "transform_class": FdedupTransform, - "statistics": statistics, - "transform_params": { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "word_shingle_size": self.params.get("world_shingle_size", 1), - "mn_min_hash": mn_min_hash, - "num_bands": num_buckets, - "length_band": length_bucket, - "remote_buckets": bucket_collectors, - "remote_minhashes": minhash_collectors, - "delimiter": self.params.get("delimiter", " "), - "random_delay_limit": random_delay_limit, - }, - "base_table_stats": False, - } - processors_list = RayUtils.create_actors( - clazz=RayTransformFileProcessor, - params=processor_params, - actor_options=worker_options, - n_actors=n_readers, - ) - self.logger.info(f"created {len(processors_list)} table processor actors") - # Execute preprocessing - # create gauges - files_in_progress_gauge = Gauge( - "preprocessing_files_in_progress", "Number of files in progress, preprocessing" - ) - files_completed_gauge = Gauge( - "preprocessing_files_processed_total", "Number of files completed, preprocessing" - ) - available_cpus_gauge = Gauge("preprocessing_available_cpus", "Number of available CPUs, preprocessing") - available_gpus_gauge = Gauge("preprocessing_available_gpus", "Number of available GPUs, preprocessing") - available_memory_gauge = Gauge("preprocessing_available_memory", "Available memory, preprocessing") - available_object_memory_gauge = Gauge( - "preprocessing_available_object_store", "Available object store, preprocessing" - ) - print_interval = int(len(files) / 100) - if print_interval == 0: - print_interval = 1 - # process data - processors = ActorPool(processors_list) - failures = RayUtils.process_files( - executors=processors, - files=files, - print_interval=print_interval, - files_in_progress_gauge=files_in_progress_gauge, - files_completed_gauge=files_completed_gauge, - available_cpus_gauge=available_cpus_gauge, - available_gpus_gauge=available_gpus_gauge, - available_memory_gauge=available_memory_gauge, - object_memory_gauge=available_object_memory_gauge, - logger=self.logger, - ) - if failures > 0: - statistics.add_stats.remote({"actor failures": failures}) - # Clean up processors - for processor in processors_list: - ray.kill(actor=processor, no_restart=True) - del processors - - def 
compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: - """ - Compute execution statistics - :param stats: output of statistics - :return: job execution statistics - """ - # Get document collector statistics - sum_docs = 0 - sum_docs_mem = 0 - sum_removed = 0 - sum_removed_mem = 0 - replies = [collector.get_size.remote() for collector in self.document_collectors] - while replies: - ready, not_ready = ray.wait(replies) - d_amount, d_memory, r_amount, r_memory = ray.get(ready)[0] - sum_docs += d_amount - sum_docs_mem += d_memory - sum_removed += r_amount - sum_removed_mem += r_memory - replies = not_ready - overall_hash_memory = self.sum_buckets_mem + self.sum_mh_mem + sum_docs_mem + sum_docs_mem + sum_removed_mem - dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1)) - return { - "number of buckets": self.sum_buckets, - "number of docs": sum_docs, - "number of removed docs": sum_removed, - "number of min hashes": self.sum_mh, - "overall hash memory GB": overall_hash_memory, - "de duplication %": dedup_prst, - } | stats - +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration -class FdedupTableTransformConfiguration(TransformConfiguration): - """ - Provides support for configuring and using the associated Transform class include - configuration with CLI args and combining of metadata. - """ - def __init__(self): - super().__init__( - name=short_name, - transform_class=FdedupFilter, - ) - from data_processing.utils import get_logger +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} - self.logger = get_logger(__name__) - def add_input_params(self, parser: ArgumentParser) -> None: - """ - Add Transform-specific arguments to the given parser. 
- """ - parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="document column name") - parser.add_argument( - f"--{cli_prefix}id_column", type=str, default="int_document_id", help="integer document id column name" - ) - parser.add_argument(f"--{cli_prefix}cluster_column", type=str, default="cluster", help="cluster column name") - parser.add_argument( - f"--{cli_prefix}bucket_cpu", type=float, default=0.5, help="number of CPUs per bucket hash" - ) - parser.add_argument( - f"--{cli_prefix}mhash_cpu", type=float, default=0.5, help="number of CPUs per minhash hash" - ) - parser.add_argument(f"--{cli_prefix}doc_cpu", type=float, default=0.5, help="number of CPUs per doc hash") - parser.add_argument(f"--{cli_prefix}num_doc_actors", type=int, default=1, help="number of doc actors to use") - parser.add_argument( - f"--{cli_prefix}num_minhash_actors", type=int, default=1, help="number of minhash actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_bucket_actors", type=int, default=1, help="number of bucket actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_preprocessors", type=int, default=1, help="number of preprocessors to use" - ) - parser.add_argument(f"--{cli_prefix}num_permutations", type=int, default=64, help="number of permutations") - parser.add_argument(f"--{cli_prefix}threshold", type=float, default=0.8, help="threshold") - parser.add_argument(f"--{cli_prefix}shingles_size", type=int, default=5, help="number of words in shingle") - parser.add_argument( - f"--{cli_prefix}delimiters", type=str, default=" ", help="delimiter for splitting document" - ) - parser.add_argument(f"--{cli_prefix}snapshot_delay", type=int, default=1, help="snapshot delay time") - parser.add_argument( - f"--{cli_prefix}use_bucket_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with bucket snapshot", - ) - parser.add_argument( - f"--{cli_prefix}use_doc_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with doc snapshot", - ) - parser.add_argument( - f"--{cli_prefix}random_delay_limit", type=int, default=10, help="maximum delay between read" - ) +ray_worker_options = {"num_cpus": 0.8} +ray_params = { + # where to run + "run_locally": True, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(ray_worker_options), + "runtime_num_workers": 3, +} - def apply_input_params(self, args: Namespace) -> bool: - """ - Validate and apply the arguments that have been parsed - :param args: user defined arguments. - :return: True, if validate pass or False otherwise - """ - captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) - self.params = self.params | captured - self.params["worker_options"] = args.runtime_worker_options - if self.params["use_bucket_snapshot"] and self.params["use_doc_snapshot"]: - self.logger.warning("both bucket and doc snapshot are specified. 
Only one allowed") - return False +ray_params_argv = ParamsUtils.dict_to_req(ray_params) - self.logger.info(f"fuzzy dedup params are {self.params}") - return True +class RayServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) -class FdedupRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=FdedupTableTransformConfiguration(), runtime_class=FdedupRuntime) + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params if service_short_name == "fdlist" else ray_params_argv + params[1:] + if service_short_name == "minhash": + launcher = RayTransformLauncher(runtime_config=SignatureCalculationRayTransformConfiguration()) + elif service_short_name == "cluster": + launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = RayTransformLauncher(runtime_config=GetDuplicateListRayTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + status = launcher.launch() + return status if __name__ == "__main__": - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - launcher.launch() + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = RayServiceOrchestrator(global_params=args) + # Launch ray fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..40081e658 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py @@ -0,0 +1,69 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListRayRuntime(DefaultRayTransformRuntime): + """ + Get duplicate list runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Get Duplicate List + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py new file mode 100644 index 000000000..cb87b56af --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # execution info
+    "minhash_num_permutations": 112,
+    "minhash_num_bands": 14,
+    "minhash_num_segments": 2,
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
new file mode 100644
index 000000000..678d953f2
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.utils import CLIArgumentProvider, get_logger
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformConfiguration for the fuzzy dedup signature calculation
+    transform, as required by the RayTransformLauncher. Signature calculation does not use
+    a dedicated RayRuntime class, so the superclass only needs the base python-only configuration. 
+    """
+
+    def __init__(self):
+        """
+        Initialization: set up the launcher with the signature calculation
+        transform configuration; no additional runtime class is required.
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    # create the launcher
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    logger.info("Launching fuzzy dedup signature calculation ray transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
new file mode 100644
index 000000000..79fe53b62
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
new file mode 100644
index 000000000..9df2f3bd5
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
new file mode 100644
index 000000000..f5da05a10
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
new file mode 100644
index 000000000..0e089dee3
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
new file mode 100644
index 000000000..4b0fecb15
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
new file mode 100644
index 000000000..57642d199
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
new file mode 100644
index 000000000..57642d199
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 000000000..8e1fe121e Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet new file mode 100644 index 000000000..03a0c321a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + 
"path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..8aa870c00 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": 
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/metadata.json index 4a1b54395..a0b26f931 100644 --- a/transforms/universal/fdedup/ray/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/ray/test-data/expected/metadata.json @@ -2,86 +2,48 @@ "pipeline": "pipeline_id", "job details": { "job category": "preprocessing", - "job name": "fdedup", - "job type": "ray", + "job name": "fdlist", + "job type": "pure python", "job id": "job_id", - "start_time": "2024-06-24 19:39:44", - "end_time": "2024-06-24 19:39:57", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", "status": "success" }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, + "code": null, "job_input_params": { - "doc_column": "contents", - "id_column": "int_id_column", - "cluster_column": "cluster", - "bucket_cpu": 0.5, - "mhash_cpu": 0.5, - "doc_cpu": 0.5, - "num_doc_actors": 1, - "num_minhash_actors": 1, - "num_bucket_actors": 1, - "num_preprocessors": 2, - "num_permutations": 64, - "threshold": 0.8, - "shingles_size": 5, - "delimiters": " ", - "snapshot_delay": 1, - "use_bucket_snapshot": false, - "use_doc_snapshot": false, - "random_delay_limit": 5, - "worker_options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, "checkpointing": false, "max_files": -1, "random_samples": -1, "files_to_use": [".parquet"], - "number of workers": 1, - "worker options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, - "actor creation delay": 0 + "num_processors": 0 }, "execution_stats": { - "cpus": 16, + "cpus": 4.5, "gpus": 0, - "memory": 14.396823502145708, - "object_store": 2.0, - "execution time, min": 0.22008283535639445 + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 }, "job_output_stats": { - "number of buckets": 15, - "number of docs": 3, - "number of removed docs": 2, - "number of min hashes": 5, - "overall hash memory GB": 7.152557373046875e-6, - "de duplication %": 40.0, - "source_files": 2, - "source_size": 73126, - "generated buckets": 15, - "generated minhashes": 5, - "source_doc_count": 10, - "generated doc_ids": 3, - "bucket processing time": 0.04204988479614258, "result_files": 1, - "result_size": 36941, - "processing_time": 2.286285161972046, - "source_documents": 5, - "result_documents": 3, - "result_doc_count": 3 + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 }, "source": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/test-data/input", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/output", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet b/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet deleted file mode 100644 index 92b4e58c7..000000000 Binary files 
a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 
100644 index 000000000..842ce2caa Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 000000000..e67164596 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet differ diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 deleted file mode 100644 index c92d73bfb..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 deleted file mode 100644 index c3966bec2..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 deleted file mode 100644 index e419c9516..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet new file mode 100644 
index 000000000..2584725bb Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/input/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet b/transforms/universal/fdedup/ray/test-data/input/sample1.parquet deleted file mode 100644 index 58387d07d..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a3771fbd8 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import ( + jaccard_similarity_threshold_cli_param, + num_bands_cli_param, + num_segments_cli_param, + sort_output_cli_param, +) +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + jaccard_similarity_threshold_cli_param: 0.7, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py new file mode 100644 index 000000000..a62105b2c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "get_list_transform", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + "run_locally": True, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_fdedup.py b/transforms/universal/fdedup/ray/test/test_fdedup.py deleted file mode 100644 index fa46fb071..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup.py +++ /dev/null @@ -1,18 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -# There is no local test for fdedup -# This is just a place holder t satisfy overall framework - - -def test_fdedup(): - pass diff --git a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py b/transforms/universal/fdedup/ray/test/test_fdedup_ray.py deleted file mode 100644 index 78ee7cc04..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py +++ /dev/null @@ -1,60 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import os - -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -class TestRayFdedupTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. - """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) - config = { - "run_locally": True, - # When running in ray, our Runtime's get_transform_config() method will load the domains using - # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 1, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, - } - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")] - return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..55869598c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py new file mode 100644 index 000000000..34f3ee403 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform import ( + num_bands_cli_param, + num_permutations_cli_param, + num_segments_cli_param, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_permutations_cli_param: 112, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + } + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile new file mode 100644 index 000000000..b04994d46 --- /dev/null +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -0,0 +1,51 @@ +ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 +FROM ${BASE_IMAGE} + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +WORKDIR ${SPARK_HOME}/work-dir + +# Copy in the data processing framework source/project and install it +# This is expected to be placed in the docker context before this is run (see the make image). 
+COPY --chown=spark:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
+
+## Copy the python version of the transform
+COPY --chown=spark:root python-transform/ python-transform/
+RUN cd python-transform && pip install --no-cache-dir -e .
+
+# Install spark project source
+COPY --chown=spark:root src/ src/
+COPY --chown=spark:root pyproject.toml pyproject.toml
+COPY --chown=spark:root README.md README.md
+RUN mkdir -p /opt/spark/work-dir/src/templates && \
+    mkdir -p /opt/spark/work-dir/config
+COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
+COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/
+
+# install requirements from requirements.txt
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+
+RUN pip install --no-cache-dir -e .
+
+# copy the main() entry point to the image
+COPY ./src/fdedup_transform_spark.py .
+
+# copy test
+COPY test/ test/
+COPY test-data/ test-data/
+
+USER spark
+
+# Set environment
+ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH}
+ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH}
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile
new file mode 100644
index 000000000..ac2735e7d
--- /dev/null
+++ b/transforms/universal/fdedup/spark/Makefile
@@ -0,0 +1,57 @@
+# Define the root of the local git clone for the common rules to be able
+# to know where they are running from.
+REPOROOT=../../../..
+
+# Set this, before including .make.defaults, to
+# 1 if requirements reference the latest code in the data processing library
+# in this repo (that is not yet published to pypi). This is the default setting.
+# 0 if the transforms DPK dependencies are on wheels published to
+# pypi (e.g. data-prep-toolkit=0.2.1)
+#USE_REPO_LIB_SRC=1
+
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse. However, feel free
+# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms
+
+# Include the common configuration for this transform
+include ../transform.config
+
+venv:: .transforms.spark-venv
+
+test:: .transforms.spark-test
+
+clean:: .transforms.clean
+
+image:: .transforms.spark-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-spark
+
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.spark-test-image
+
+run-cli-sample: .transforms.run-cli-spark-sample
+
+run-local-sample: .transforms.run-local-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md
new file mode 100644
index 000000000..3bf9b3245
--- /dev/null
+++ b/transforms/universal/fdedup/spark/README.md
@@ -0,0 +1,109 @@
+# Spark-GUF
+
+This is an implementation of Spark data processing modules.
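As a quick orientation, here is a minimal sketch of launching the signature-calculation step of fuzzy dedup locally; it is condensed from `src/signature_calc_local_spark.py` in this change, and the input/output folders below are placeholders to adapt to your own layout.

```python
import sys

from data_processing.utils import ParamsUtils
from data_processing_spark.runtime.spark import SparkTransformLauncher
from signature_calc_transform_spark import (
    SignatureCalculationSparkTransformConfiguration,
)

# Placeholder folders; point these at your own data layout.
local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
params = {
    # Data access. Only required parameters are specified.
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "scdata_local_config": ParamsUtils.convert_to_ast(local_conf),
    # execution info
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
    # minhash parameters, mirroring the test defaults used in this change
    "minhash_num_permutations": 112,
    "minhash_num_bands": 14,
    "minhash_num_segments": 2,
}

if __name__ == "__main__":
    # Simulate the command line expected by the launcher.
    sys.argv = ParamsUtils.dict_to_req(d=params)
    launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration())
    # Launch the Spark worker(s) to process the input.
    launcher.launch()
```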
+At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster.
+
+The modules can run locally or remotely in a Kubernetes cluster.
+
+## Running Transforms locally
+
+Start in the `spark-guf` directory. To run the modules locally, follow these steps:
+1. Create a virtual environment using this command:
+   ```
+   make venv
+   ```
+2. Activate the virtual environment:
+   ```
+   source venv/bin/activate
+   ```
+3. Set the `PYTHONPATH` environment variable to include the `src` directory:
+   ```
+   export PYTHONPATH=${PYTHONPATH}:${PWD}/src
+   ```
+4. Invoke one of the transforms:
+   ```
+   python src/transforms/spark_pi/spark_transformer_pi.py
+   ```
+5. To find out which arguments a transform takes, run that transform with a `--help` flag:
+   ```
+   python src/transforms/spark_filter/spark_filter_transform.py --help
+   usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE]
+                                    --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP]
+                                    [--filter_logical_operator {AND,OR}]
+
+   optional arguments:
+     -h, --help            show this help message and exit
+     --input_folder INPUT_FOLDER
+                           path to read the input files (local fs or s3)
+     --output_folder OUTPUT_FOLDER
+                           path to write the output files (local fs or s3)
+     --data_type DATA_TYPE
+                           Type of files to filter (parquet, orc, csv, json, txt)
+     --filter_criteria_list FILTER_CRITERIA_LIST
+                           list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04' AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ]
+     --filter_columns_to_drop FILTER_COLUMNS_TO_DROP
+                           list of columns to drop after filtering, for example: ["column1", "column2"]
+     --filter_logical_operator {AND,OR}
+                           logical operator (AND or OR) that joins filter criteria
+   ```
+
+## Running Transforms in Kubernetes/OpenShift
+
+Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps:
+
+1. Build and push a pyspark base docker image (this example assumes that images are pushed to Docker Hub, but the same approach can be used to push images to icr.io or quay.io):
+   ```
+   docker build -t my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark:3.5.1
+   ```
+2. Build and push a specific transform image (this will use the pyspark image built in the previous step as the base image):
+   ```
+   docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark-filter:3.5.1
+   ```
+3. Configure the `spark` service account (note that you can use any other service account name, but you will then need to replace `spark` with `your-service-account-name` in all the yaml files listed below).
+   This is a one-time process to perform for each namespace where you want to run spark apps:
+   ```
+   # create 'spark' service account
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace
+
+   # create 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the cluster roles
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace
+   ```
+
+4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets.
+
+5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory. The steps below refer to the [yaml file used to build the filter pod](deployment/kubernetes/pods/spark-driver-pod-filter.yaml):
+   1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`).
+   2. Specify the namespace where the pod will be created (`metadata/namespace`). Use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`.
+   3. Specify the command to launch the Spark application (in `spec/containers/args`).
+   4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built in step 2).
+   5. Specify the image used by the executors (the `EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`).
+   6. Specify the service account to be used by the driver (`spec/containers/serviceAccount`) and by the executors (the `SERVICE_ACCOUNT` variable in `spec/containers/env`).
+   7. Configure S3:
+      1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs.
+      2. Specify the input and output access key IDs and secret access keys.
+
+6. Launch the Spark application by creating the driver pod:
+   ```
+   kubectl apply -f deployment/kubernetes/pods/spark-driver-pod-filter.yaml
+   ```
+
+7. Monitor the creation of the executor pods:
+   ```
+   kubectl get pods -w
+   ```
+
+8. Monitor the driver logs:
+   ```
+   kubectl logs spark-driver-pod-filter -f
+   ```
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
new file mode 100644
index 000000000..d9579e0c7
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Pod
+metadata:
+spec:
+  imagePullSecrets:
+  - name: prod-all-icr-io
+  securityContext:
+    fsGroup: 0
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
new file mode 100644
index 000000000..eeddbd694
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
@@ -0,0 +1,14 @@
+spark.app.name: ${APP_NAME}
+spark.driver.memory: ${DRIVER_MEMORY}
+spark.executor.instances: ${NUM_EXECUTORS}
+spark.executor.memory: ${EXECUTOR_MEMORY}
+spark.executor.cores: ${EXECUTOR_CORES}
+spark.sql.shuffle.partitions: ${NUM_TASKS}
+spark.task.cpus: ${TASK_CPUS}
+spark.sql.legacy.parquet.nanosAsLong: true
+spark.executor.decommission.forceKillTimeout: "10h"
+# spark.sql.files.ignoreCorruptFiles: true
+# configuration needed when running in kubernetes
+spark.kubernetes.authenticate.driver.serviceAccountName: ${SERVICE_ACCOUNT}
+spark.kubernetes.container.image: ${EXECUTOR_DOCKER_IMAGE}
+spark.kubernetes.namespace: ${EXECUTOR_NAMESPACE}
diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml
new file mode 100644
index 000000000..f77df2010
--- /dev/null
+++ b/transforms/universal/fdedup/spark/pyproject.toml
@@ -0,0 +1,45 @@
+[project]
+name = "dpk_fdedup_transform_spark"
+version = "0.2.2.dev2"
+requires-python = ">=3.10,<3.13"
+description = "Fuzzy Dedup Spark Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
+]
+dynamic = ["dependencies"]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt
new file mode 100644
index 000000000..c373ffbb7
--- /dev/null
+++ b/transforms/universal/fdedup/spark/requirements.txt
@@ -0,0 +1,11 @@
+dpk_fdedup_transform_python==0.2.2.dev2
+data-prep-toolkit[spark]==0.2.2.dev2
+pyyaml>=6.0.2
+boto3>=1.34.69
+kubernetes>=30.1.0
+polars==1.9.0
+disjoint-set>=0.8.0
+numpy<1.29.0
+sentencepiece>=0.2.0
+mmh3>=4.1.0
+scipy>=1.12.0, <2.0.0
diff --git
a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py new file mode 100644 index 000000000..c9950657c --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py new file mode 100644 index 000000000..feeb3241e --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -0,0 +1,75 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisSparkRuntime(DefaultSparkTransformRuntime): + """ + Cluster analysis runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisSparkRuntime, + ) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis spark transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py new file mode 100644 index 000000000..eb1e61845 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py new file mode 100644 index 000000000..2ff0df8bf --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -0,0 +1,124 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class DataCleaningSparkRuntime(DefaultSparkTransformRuntime): + """ + Data cleaning runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Data Cleaning + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningSparkRuntime, + ) + + def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. 
This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :return: dictionary of parameters to be broadcast + """ + data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + ) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return {"df": self.duplicate_list} + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py new file mode 100644 index 000000000..82767f849 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py @@ -0,0 +1,62 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import argparse +import os +import sys + +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing_spark.runtime.spark import SparkTransformLauncher +from fdedup_transform_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + + +class SparkServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + elif service_short_name == "cluster": + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + status = launcher.launch() + return status + + +if __name__ == "__main__": + + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = SparkServiceOrchestrator(global_params=args) + # Launch spark fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/src/requirements.txt new file mode 100644 index 000000000..c1a1f2c3d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/requirements.txt @@ -0,0 +1,8 @@ +pyspark +pyarrow +pyyaml +boto3 +kubernetes +disjoint_set +mmh3 +scipy diff --git a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py new file mode 100644 index 000000000..2db884346 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py new file mode 100644 index 000000000..4e39810c6 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Signature Calculation + as required by the PythonTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file 
mode 100644 index 000000000..8e1fe121e Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + 
"target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet new file mode 100644 index 000000000..03a0c321a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": 
"job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..8aa870c00 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 
64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/metadata.json new file mode 100644 index 000000000..a0b26f931 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 4.5, + "gpus": 0, + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet differ diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 000000000..842ce2caa Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 000000000..e67164596 Binary 
files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + 
"max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/input/df1.parquet b/transforms/universal/fdedup/spark/test-data/input/df1.parquet new file mode 100644 index 000000000..2584725bb Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/input/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py new file mode 100644 index 000000000..294c86f25 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = SparkTransformLauncher(ClusterAnalysisSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py new file mode 100644 index 000000000..919857e23 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py @@ -0,0 +1,58 @@ +# (C) Copyright IBM Corp. 
2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = SparkTransformLauncher(DataCleaningSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py new file mode 100644 index 000000000..6d93dc7a9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config index 774716e15..ffaeb9f45 100644 --- a/transforms/universal/fdedup/transform.config +++ b/transforms/universal/fdedup/transform.config @@ -14,5 +14,6 @@ TRANSFORM_NAME=fdedup # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -FDEDUP_RAY_VERSION=$(DPK_VERSION) - +FDEDUP_PYTHON_VERSION=$(DPK_VERSION) +FDEDUP_RAY_VERSION=$(FDEDUP_PYTHON_VERSION) +FDEDUP_SPARK_VERSION=$(FDEDUP_PYTHON_VERSION) diff --git a/transforms/universal/fdedup/utils/Makefile.local b/transforms/universal/fdedup/utils/Makefile.local new file mode 100644 index 000000000..d9dae01d7 --- /dev/null +++ b/transforms/universal/fdedup/utils/Makefile.local @@ -0,0 +1,18 @@ +PYTHON=python +PIP=pip + +venv: requirements.txt + $(PYTHON) -m venv venv + if [ -e venv/Scripts/activate ]; then \ + echo "For Windows please try the following AS Administrator - no guarantees"; \ + echo " venv\\Scripts\\activate"; \ + echo " pip install --upgrade pip"; \ + echo " pip install -r requirements.txt"; \ + echo " pip install pytest"; \ + else \ + . venv/bin/activate; \ + $(PIP) install --upgrade pip; \ + $(PIP) install -r requirements.txt; \ + fi +set-versions: + @: \ No newline at end of file diff --git a/transforms/universal/fdedup/utils/calc_r_and_b.ipynb b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb new file mode 100644 index 000000000..8398f9efa --- /dev/null +++ b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf5dba9a-d530-4a0a-ae71-2d741f7e705f", + "metadata": {}, + "source": [ + "This notebook allows calculating the values for `b` (the number of bands) and `r` (the number of minhashes in a band) used in the fuzzy dedup algorithm. The default values are `b=14` and `r=8`, as defined in the [FineWeb datasets paper](https://arxiv.org/pdf/2406.17557). The x-axis of the graph represents the Jaccard similarity between a pair of documents, while the y-axis represents the probability that they become duplication candidates. Please refer to http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf for more details on this methodology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800bc113-8b5e-4cec-8717-98fa05753bd0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define the parameterized function\n", + "def f(s, r, b):\n", + " return 1 - (1 - s**r)**b\n", + "\n", + "# Set the parameters r and b\n", + "r = 8\n", + "b = 14\n", + "\n", + "# Generate values for s in a range, e.g., from 0 to 1\n", + "s_values = np.linspace(0, 1, 500) # 500 points between 0 and 1\n", + "f_values = f(s_values, r, b)\n", + "\n", + "# Plot the function\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(s_values, f_values, label=fr\"$f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\", color='blue')\n", + "plt.xlabel(\"s\")\n", + "plt.ylabel(\"f(s)\")\n", + "plt.title(f\"Plot of the function $f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98016b04-b6a0-465d-b65b-6d402978c9f0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/utils/requirements.txt b/transforms/universal/fdedup/utils/requirements.txt new file mode 100644 index 000000000..ce2acfefb --- /dev/null +++ b/transforms/universal/fdedup/utils/requirements.txt @@ -0,0 +1,3 @@ +jupyter +numpy +matplotlib
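For readers of this diff, the short sketch below restates the banding math that the calc_r_and_b.ipynb notebook plots. It is not part of the PR: the function name, the sample similarity values, and the printed labels are illustrative assumptions. The probability formula and the (1/b)^(1/r) threshold rule of thumb come from the MMDS chapter the notebook links to, and the band/segment count mirrors the num_bands=14, num_segments=2 defaults used in the test fixtures above.

# Minimal sketch, assuming b bands of r minhashes each, as in the notebook (not repository code).
def candidate_probability(s: float, r: int = 8, b: int = 14) -> float:
    """Probability that two documents with Jaccard similarity s collide in at least one band."""
    return 1.0 - (1.0 - s**r) ** b

if __name__ == "__main__":
    r, b = 8, 14
    # MMDS rule of thumb: the S-curve rises steeply near (1/b)**(1/r), about 0.72 here,
    # which is consistent with the 0.7 jaccard_similarity_threshold used by cluster analysis.
    print("approximate candidate threshold:", round((1.0 / b) ** (1.0 / r), 3))
    for s in (0.5, 0.7, 0.8, 0.9):
        print(f"P(candidate | s={s}) = {candidate_probability(s, r, b):.3f}")
    # 14 bands x 2 segments = 28 band/segment partitions, matching the 28 input files
    # reported in the expected cluster_analysis metadata earlier in this diff.
    print("band/segment partitions:", b * 2)

As a design note, raising b (more bands) or lowering r pushes the effective threshold down and catches more near-duplicate pairs at the cost of more false candidates; the notebook's plot makes that trade-off visible before committing to new defaults.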