Fuzzy dedup #699

Status: Open. Wants to merge 86 commits into base: dev.

Commits (showing changes from 72 of 86 commits):
47f4526 added folder_transform (blublinsky, Oct 10, 2024)
5fd20a1 added folder_transform (blublinsky, Oct 10, 2024)
38b4725 added folder_transform (blublinsky, Oct 10, 2024)
a3abf21 added folder_transform (blublinsky, Oct 11, 2024)
d93a06c Merge branch 'folder_transform' into fuzzy-dedup (cmadam, Oct 11, 2024)
af8475d Fuzzy dedup pure python implementation (cmadam, Oct 11, 2024)
7f9b503 Fuzzy dedup spark implementation (cmadam, Oct 11, 2024)
3349521 added folder_transform (blublinsky, Oct 10, 2024)
0553edf added folder_transform (blublinsky, Oct 10, 2024)
a53412e added folder_transform (blublinsky, Oct 10, 2024)
9c3ace7 added folder_transform (blublinsky, Oct 11, 2024)
7091a2e added noop testing (blublinsky, Oct 11, 2024)
680c78a Fuzzy dedup ray implementation (Kibnelson, Oct 11, 2024)
0c31dc0 Fixed bug in ray to distribute docs to remove file to all workers (Kibnelson, Oct 11, 2024)
47d8fdf Merge with updated folder_transform branch (cmadam, Oct 11, 2024)
6ee6695 added folder_transform (blublinsky, Oct 10, 2024)
e7260ba added folder_transform (blublinsky, Oct 10, 2024)
5856f3f added folder_transform (blublinsky, Oct 10, 2024)
6519686 added folder_transform (blublinsky, Oct 11, 2024)
c728224 added noop testing (blublinsky, Oct 11, 2024)
6e2863a added noop Ray testing (blublinsky, Oct 13, 2024)
3c9be57 added noop Spark testing (blublinsky, Oct 13, 2024)
371a712 more data access simplifications (blublinsky, Oct 13, 2024)
680f313 Renamed/refactored fuzzy dedup python orchestrator (cmadam, Oct 14, 2024)
c29d3bf Rewrote cluster_analysis_transform as a folder_transform (cmadam, Oct 14, 2024)
aada59e Wrote get_duplicate_list_transform as a folder_transform (cmadam, Oct 14, 2024)
2019d56 Added text preprocessing (cmadam, Oct 14, 2024)
9362803 Added python test data (cmadam, Oct 14, 2024)
ddbd602 Added project admin tools (cmadam, Oct 14, 2024)
4dac838 Bug fix (cmadam, Oct 14, 2024)
fbc2b58 Add op modes for data cleaning: filter (non)dupl and annotate (cmadam, Oct 14, 2024)
828ec41 Python and spark transforms for cluster analysis (cmadam, Oct 14, 2024)
a20fe76 Merge folder_transform (cmadam, Oct 14, 2024)
bc6b81c Sync spark Makefile with dpk (cmadam, Oct 14, 2024)
4d486d3 Spark orchestration for fuzzy dedup (cmadam, Oct 14, 2024)
19e0844 Bug fix (cmadam, Oct 14, 2024)
2ce3d8c Added spark test data (cmadam, Oct 14, 2024)
5e4022c Setting input test data for ray (cmadam, Oct 14, 2024)
c14bdaa Bug fix (cmadam, Oct 14, 2024)
1215ac5 Ray orchestration for fuzzy dedup (cmadam, Oct 14, 2024)
5966972 Merge with the latest dev branch (cmadam, Oct 17, 2024)
caf79a3 Added python test with expected data files (Kibnelson, Oct 18, 2024)
8fd9676 Added python tests and expected outputs for the tests (Kibnelson, Oct 18, 2024)
d07a23a Update versions in pyproject.toml (cmadam, Oct 18, 2024)
ec2168c Updated ray test data (cmadam, Oct 18, 2024)
fd0f52c Updated ray tests (cmadam, Oct 18, 2024)
954dffd Spark test data and tests (cmadam, Oct 18, 2024)
77d85fd Adjust to file naming changes (cmadam, Oct 18, 2024)
310d813 Create python Dockerfile (cmadam, Oct 18, 2024)
7d97cef Ray bug fixes (cmadam, Oct 19, 2024)
87902ac Fix spark image to support testing (cmadam, Oct 19, 2024)
c847924 Removed file copy utils (cmadam, Oct 25, 2024)
ba9b07c Add fdedup to kfp black list until we get kfp integration (cmadam, Oct 25, 2024)
f187948 Freeze polars version to 1.9.0 for now (cmadam, Oct 25, 2024)
84b9104 Fixed duplicate_list_location bug (cmadam, Oct 25, 2024)
08ff006 Allow input of s3 credentials on command line (cmadam, Oct 25, 2024)
d0c6f8a Added license (cmadam, Oct 25, 2024)
63e11eb Use str2bool for use_s3 argument (cmadam, Oct 25, 2024)
bf550fd Add overwrite output path argument (cmadam, Oct 29, 2024)
272be36 Add separate data access objects for reading and writing files (cmadam, Oct 30, 2024)
ee411e1 Define 2 data access objects for data and duplicate list (cmadam, Oct 31, 2024)
3a30501 get fdedeup/python test-image to pass, and clean up req in ray version (daw3rd, Nov 1, 2024)
80ae8df Added an option to run either word or char shingle (Kibnelson, Nov 8, 2024)
c531809 Use captured_arg_keys to list the arguments of each transform (cmadam, Nov 10, 2024)
fe43110 Ray implementation for get_duplicate_list_transform (cmadam, Nov 10, 2024)
82a1860 Bug fix: jaccard threshold type must be float (cmadam, Nov 10, 2024)
61ed40f Get fuzzy dedup ray image ready for kfp (cmadam, Nov 10, 2024)
a8ede00 kfp implementation for fuzzy dedup (cmadam, Nov 10, 2024)
524236d Merge word/char shingles (cmadam, Nov 10, 2024)
96edea4 Added params to captured_arg_keys (cmadam, Nov 11, 2024)
24163af Add shingle type option (word or char) to kfp (cmadam, Nov 11, 2024)
3a43c3d Utility to calculate number of bands and length of a band (cmadam, Nov 13, 2024)
83c05f9 Merge branch 'dev' into fuzzy-dedup (cmadam, Nov 13, 2024)
2f61be7 Set correct version for pyproject (cmadam, Nov 13, 2024)
cd5eb05 Change the name of the utils Makefile (cmadam, Nov 13, 2024)
6cc18cd Copy whl file to the context folder (cmadam, Nov 14, 2024)
9f33620 Use keyword args in compute_common_params (cmadam, Nov 14, 2024)
528457c Use dynamic dependencies (cmadam, Nov 14, 2024)
fffb630 Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 (cmadam, Nov 14, 2024)
5547d7f Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 (cmadam, Nov 14, 2024)
09e56e0 Remove pyproject.toml dependencies (cmadam, Nov 14, 2024)
d3eac50 Fix bug in number of actors calculation (cmadam, Nov 15, 2024)
fa5959b Cleanup main entry point and local implementation of python transforms (cmadam, Nov 15, 2024)
c4f889b Cleanup main entry point and local implementation of ray transforms (cmadam, Nov 15, 2024)
f3c5be0 Cleanup main entry point and local implementation of spark transforms (cmadam, Nov 15, 2024)
4941d5b Cleanup main entry point and local implementation of spark transforms (cmadam, Nov 15, 2024)
@@ -62,7 +62,7 @@ def orchestrate(
     if is_folder:
         # folder transform
         files = runtime.get_folders(data_access=data_access)
-        logger.info(f"Number of folders is {len(files)}") # Get files to process
+        logger.info(f"Number of folders is {len(files)}")  # Get files to process
     else:
         files, profile, retries = data_access.get_files_to_process()
     if len(files) == 0:
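For context, the hunk above is the folder-transform dispatch that fuzzy dedup relies on. A minimal sketch of that dispatch, assuming only the two calls visible in the diff (runtime.get_folders and data_access.get_files_to_process); the wrapper function and its name are invented for illustration:

# Hypothetical wrapper (not the PR's code) around the dispatch in the hunk above.
def select_work_items(runtime, data_access, is_folder: bool):
    if is_folder:
        # folder transforms, such as fuzzy dedup's cluster analysis, receive whole folders
        return runtime.get_folders(data_access=data_access)
    # regular transforms receive individual files to process
    files, _profile, _retries = data_access.get_files_to_process()
    return files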
2 changes: 1 addition & 1 deletion scripts/check-workflows.sh
@@ -17,7 +17,7 @@ if [ ! -d transforms ]; then
     echo Please run this script from the top of the repository
     exit 1
 fi
-KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering"
+KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering fdedup"
 while [ $# -ne 0 ]; do
     case $1 in
     -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0;
324 changes: 240 additions & 84 deletions transforms/universal/fdedup/kfp_ray/fdedup_wf.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions transforms/universal/fdedup/python/.dockerignore
@@ -0,0 +1 @@
venv/
43 changes: 43 additions & 0 deletions transforms/universal/fdedup/python/Dockerfile
@@ -0,0 +1,43 @@
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image target).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
Collaborator: The new makefile will copy the whl file (not the source) to the context folder.

Collaborator: Fixed in commit 6cc18cd.

RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
#COPY --chown=dpk:root requirements.txt requirements.txt

RUN pip install --no-cache-dir -e .

# copy source code
COPY src/ src/

# copy the main entry point to the image root
COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py
Collaborator: Questionable practice!!! Can we find an alternative?

Member: On renaming, not the move.

Member: And why is signature_calc_transform the main entry point? Shouldn't it be fuzzy_dedup_python.py?

Collaborator: Fixed in commit fa5959b.

COPY ./src/signature_calc_local_python.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
64 changes: 64 additions & 0 deletions transforms/universal/fdedup/python/Makefile
@@ -0,0 +1,64 @@
# Define the root of the local git clone so that the common rules
# know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transform's DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

# Include the common configuration for this transform
include ../transform.config

venv:: .transforms.python-venv

test:: .transforms.python-test

clean:: .transforms.clean

image:: .transforms.python-image

test-src:: .transforms.test-src

setup:: .transforms.setup

build:: build-dist image

publish: publish-image

publish-image:: .transforms.publish-image-python

setup:: .transforms.setup

# The distribution version is the same as the image version.
set-versions:
$(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions

build-dist:: .defaults.build-dist

publish-dist:: .defaults.publish-dist

test-image:: .transforms.python-test-image

run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

#run-s3-ray-sample: .transforms.run-s3-ray-sample

minio-start: .minio-start

kind-load-image:: .transforms.kind-load-image

docker-load-image: .defaults.docker-load-image

docker-save-image: .defaults.docker-save-image
11 changes: 11 additions & 0 deletions transforms/universal/fdedup/python/README.md
Member: Some more words here to provide a gentle introduction would be nice. In addition, you need to describe all of the configuration keys. See doc_chunk for a template.

Collaborator (@cmadam, Nov 14, 2024): I am still working on the documentation.

@@ -0,0 +1,11 @@
# Fuzzy Dedup

Please see the set of
[transform project conventions](../../../README.md)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary

The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see
[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details.

Member: Forgive me if this is a duplicate comment as I thought I had submitted once already, but...
  1. A more gentle introduction to what the transform does instead of only providing the links.
  2. The set of configuration keys should be documented. See doc_chunk for a nice example.
  3. This file needs to be linked from a ../README.md, which now only points to ray and python.
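To make the summary concrete, here is a small, self-contained sketch of the idea the transform builds on (illustrative only, not the transform's code; the shingle size and sample texts are made up). Documents are cut into overlapping word shingles, and the Jaccard similarity of the shingle sets, which MinHash signatures approximate cheaply, measures near-duplication:

# Illustrative sketch: exact Jaccard over word shingles. MinHash signatures
# approximate this value without materializing the full shingle sets.
def word_shingles(text: str, k: int = 3) -> set:
    words = text.split()
    return {" ".join(words[i : i + k]) for i in range(len(words) - k + 1)}

def jaccard(a: set, b: set) -> float:
    return len(a & b) / len(a | b)

s1 = word_shingles("the quick brown fox jumps over the lazy dog")
s2 = word_shingles("the quick brown fox leaps over the lazy dog")
print(jaccard(s1, s2))  # shingle-set overlap of the two near-duplicate texts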
55 changes: 55 additions & 0 deletions transforms/universal/fdedup/python/pyproject.toml
@@ -0,0 +1,55 @@
[project]
name = "dpk_fdedup_transform_python"
version = "0.2.2.dev1"
requires-python = ">=3.10"
description = "Fuzzy Dedup Transform for Python"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Nelson Bore", email = "[email protected]" },
{ name = "Constantin Adam", email = "[email protected]" },
]
dependencies = [
Collaborator: Please move to requirements.txt and use dynamic dependencies in the pyproject.toml.

Collaborator: Fixed in commit 528457c.

"data-prep-toolkit==0.2.2.dev1",
"pyarrow==16.1.0",
"pyyaml>=6.0.2",
"boto3>=1.34.69",
"kubernetes>=30.1.0",
"polars==1.9.0",
"disjoint-set>=0.8.0",
"scipy>=1.14.1, <2.0.0",
"numpy<1.29.0",
"sentencepiece>=0.2.0",
"mmh3>=4.1.0",
]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]

[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
112 changes: 112 additions & 0 deletions transforms/universal/fdedup/python/src/Murmur_MH.py
@@ -0,0 +1,112 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################


import logging
import os
from typing import List, Set

import mmh3
import numpy as np


class Murmur_MH:
    def __init__(self, num_perm=64, seed=42, hashfunc=None):
        self.seed = seed
        self.num_perm = num_perm  # the number of buckets, i.e. the vector length after a self.minhash() call
        self.permutations = self._init_permutations(seed, num_perm)

    def _init_permutations(self, seed, num_perm):
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        max_int = np.uint64((1 << 64) - 1)
        # initialize pseudo random number generator with given seed value
        gen = np.random.RandomState(seed)
        # draw num_perm pseudo-random numbers in [0, max_int)
        permutations = np.array(
            [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)],
            dtype=np.uint64,
        ).T
        # make all even pseudo random numbers odd by adding 1
        permutations[permutations % 2 == 0] += 1
        return permutations

    def minhash(self, shingles: List[str]):
        """return np.array of minhash"""
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
        return (
            np.right_shift(
                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
                32,
            )
            .astype(np.uint32)
            .min(axis=0)
        )

    def minhash2(self, shingles: List[str], doc_len: int):
        """
        For each shingle (i.e. a group of k words), generate a digest value based on
        the 32-bit mmh3 hash function.

        Returns a tuple (A, B) where
        A = the minhash signature, an np.array of values
        B = document_length = the number of characters
        """
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
        return (
            np.right_shift(
                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
                32,
            )
            .astype(np.uint32)
            .min(axis=0),
            doc_len,
        )

    def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int):
        """
        For each shingle (i.e. a group of k words), generate a digest value based on
        the 32-bit mmh3 hash function.

        Returns a tuple (A, B, C) where
        A = the minhash signature, as a list of values
        B = document_length = the number of characters
        C = doc_id = the document id
        """
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
        return (
            np.right_shift(
                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
                32,
            )
            .astype(np.uint32)
            .min(axis=0)
            .tolist(),
            doc_len,
            doc_id,
        )

    @staticmethod
    def jaccard(mh1: np.array, mh2: np.array) -> float:
        """
        The Jaccard similarity measures the similarity between two sets of data,
        identifying which members are shared and which are distinct.

        It is calculated by dividing the number of observations in both sets by
        the number of observations in either set. Developed by Paul Jaccard, the
        index ranges from 0 to 1; the closer to 1, the more similar the two sets.

        Since each document is represented by a set of shingles, we use the Jaccard
        similarity to measure how similar two documents are; here it is estimated
        as the fraction of matching slots in the two minhash signatures.
        """
        assert len(mh1) == len(mh2)
        return np.count_nonzero(mh1 == mh2) / len(mh1)
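A hedged usage sketch for the class above (the sample texts and shingle size are invented; assumes the module is importable under its file name): two signatures are compared slot by slot, which estimates the Jaccard similarity of the underlying shingle sets.

# Usage sketch for Murmur_MH with illustrative inputs.
from Murmur_MH import Murmur_MH

def word_shingles(text: str, k: int = 3) -> list:
    words = text.split()
    return [" ".join(words[i : i + k]) for i in range(len(words) - k + 1)]

mh = Murmur_MH(num_perm=64, seed=42)
sig1 = mh.minhash(word_shingles("the quick brown fox jumps over the lazy dog"))
sig2 = mh.minhash(word_shingles("the quick brown fox leaps over the lazy dog"))
# the fraction of matching signature slots estimates the true Jaccard similarity
print(Murmur_MH.jaccard(sig1, sig2))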
@@ -0,0 +1,49 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os
import sys

from cluster_analysis_transform_python import (
    ClusterAnalysisPythonTransformConfiguration,
)
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove"))
local_conf = {
    "input_folder": input_folder,
    "output_folder": output_folder,
}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
params = {
    # Data access. Only required parameters are specified
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    # execution info
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
    "cluster_num_bands": 14,
    "cluster_num_segments": 2,
    "cluster_jaccard_similarity_threshold": 0.7,
}
if __name__ == "__main__":
    # Set the simulated command line args
    sys.argv = ParamsUtils.dict_to_req(d=params)
    print(sys.argv)
    # create launcher
    launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration())
    # Launch python to process the input
    launcher.launch()
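The cluster_num_bands and cluster_jaccard_similarity_threshold parameters above are tied together by the standard LSH banding analysis, and the PR adds a utility for exactly this computation (commit 3a43c3d, "Utility to calculate number of bands and length of a band"). A hedged sketch of the relationship, using the textbook formulas from the Ullman chapter linked in the README rather than the PR's utility; the band length r = 8 is an assumption:

# With b bands of r minhashes each, two documents with Jaccard similarity s
# share at least one band with probability 1 - (1 - s**r)**b; the midpoint of
# that S-curve is approximately (1/b)**(1/r).
b, r = 14, 8  # 14 bands as in the params above; r = 8 is assumed
print(f"approximate similarity threshold: {(1 / b) ** (1 / r):.2f}")  # ~0.72

def collision_probability(s: float, b: int = 14, r: int = 8) -> float:
    return 1 - (1 - s**r) ** b

for s in (0.5, 0.7, 0.9):
    print(s, round(collision_probability(s), 3))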