Commit
Merge pull request #5 from x-tabdeveloping/levenshtein
Refine results with Levenshtein distance
x-tabdeveloping authored Sep 6, 2024
2 parents 20dcb51 + c2874a6 commit 2f3e7ab
Showing 3 changed files with 64 additions and 6 deletions.
18 changes: 16 additions & 2 deletions README.md
@@ -10,6 +10,20 @@ Blazing fast, lightweight and customizable fuzzy and semantic text search in Python
Neofuzz is a fuzzy search library based on vectorization and approximate nearest neighbour
search techniques.

### New in version 0.3.0
Now you can reorder your search results using Levenshtein distance!
Sometimes n-gram or vectorized processes don't quite order the results correctly.
In these cases, you can retrieve a larger number of candidates from the indexed corpus and then refine those results with Levenshtein distance.

```python
from neofuzz import char_ngram_process

process = char_ngram_process()
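# `corpus` is any iterable of the strings you want to search over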
process.index(corpus)

process.extract("your query", limit=30, refine_levenshtein=True)
```
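The `extract` method keeps its TheFuzz-compatible interface: it returns a list of `(option, score)` tuples, with scores on a 0–100 scale derived from the underlying match distances.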

### Why is Neofuzz fast?
Most fuzzy search libraries rely on optimizing the hell out of the same couple of fuzzy search algorithms (Hamming distance, Levenshtein distance). Unfortunately, due to the complexity of these algorithms, sometimes no amount of optimization will get you the speed you want.

@@ -93,7 +107,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
process = Process(vectorizer, metric="cosine")
```

### Dimentionality Reduction
### Dimensionality Reduction

You might find that the speed of your fuzzy search process is not sufficient. In this case it might be desirable to reduce the dimensionality of the produced vectors with some matrix decomposition method or topic model.

@@ -107,7 +121,7 @@ from sklearn.pipeline import make_pipeline

# Vectorization with tokens again
vectorizer = TfidfVectorizer()
# Dimentionality reduction method to 20 dimentions
# Dimensionality reduction method to 20 dimensions
nmf = NMF(n_components=20)
# Create a pipeline of the two
pipeline = make_pipeline(vectorizer, nmf)
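# A sketch, not part of this diff: the pipeline can then be passed to a
# Process in place of a plain vectorizer, just as in the earlier example.
process = Process(pipeline, metric="cosine")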
49 changes: 46 additions & 3 deletions neofuzz/process.py
@@ -7,6 +7,7 @@
import pynndescent
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances
from thefuzz import process as thefuzz_process


class Process:
@@ -20,6 +21,8 @@ class Process:
Some kind of vectorizer model that can vectorize strings.
You could use tf-idf, bow or even a Pipeline that
has multiple steps.
refine_levenshtein: bool, default False
Indicates whether final results should be refined with the Levenshtein algorithm
metric: string or callable, default 'cosine'
The metric to use for computing nearest neighbors. If a callable is
used it must be a numba njit compiled function. Supported metrics
@@ -143,6 +146,7 @@ class Process:
def __init__(
self,
vectorizer,
refine_levenshtein=False,
metric="cosine",
metric_kwds=None,
n_neighbors=30,
@@ -165,6 +169,7 @@ def __init__(
verbose=False,
):
self.vectorizer = vectorizer
self.refine_levenshtein = refine_levenshtein
self.nearest_neighbours_kwargs = {
"metric": metric,
"metric_kwds": metric_kwds,
@@ -213,7 +218,10 @@ def index(self, options: Iterable[str]):
self.nearest_neighbours.prepare()

def query(
self, search_terms: Iterable[str], limit: int = 10
self,
search_terms: Iterable[str],
limit: int = 10,
refine_levenshtein: Optional[bool] = None,
) -> Tuple[np.ndarray, np.ndarray]:
"""Searches for the given terms in the options.
@@ -223,6 +231,11 @@
Terms to search for.
limit: int, default 10
Amount of closest matches to return.
refine_levenshtein: bool, default None
Indicates whether results should be refined with Levenshtein distance
using TheFuzz.
This can increase the accuracy of your results.
If not specified, the process's attribute is used.
Parameters
----------
@@ -237,13 +250,36 @@
" please index before querying."
)
search_matrix = self.vectorizer.transform(search_terms)
return self.nearest_neighbours.query(search_matrix, k=limit)
indices, distances = self.nearest_neighbours.query(
search_matrix, k=limit
)
if refine_levenshtein is None:
refine_levenshtein = self.refine_levenshtein
if refine_levenshtein:
refined_indices = []
refined_distances = []
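# Rerank each query term's nearest-neighbour candidates with TheFuzz (Levenshtein-based scoring)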
for term, idx in zip(search_terms, indices):
options = list(self.options[idx])
res = thefuzz_process.extract(
term, options, limit=len(options)
)
res_indices = []
res_dist = []
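# Map each reranked option back to its corpus index and convert the 0-100 similarity into a distance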
for result_term, result_sim in res:
res_indices.append(idx[options.index(result_term)])
res_dist.append(1 - (result_sim / 100))
refined_indices.append(res_indices)
refined_distances.append(res_dist)
indices = np.stack(refined_indices)
distances = np.stack(refined_distances)
return indices, distances

def extract(
self,
query: str,
choices: Optional[Iterable[str]] = None,
limit: int = 10,
refine_levenshtein: Optional[bool] = None,
) -> List[Tuple[str, int]]:
"""TheFuzz compatible querying.
@@ -257,6 +293,11 @@
it will be used for indexing.
limit: int, default 10
Number of results to return
refine_levenshtein: bool, default None
Indicates whether results should be refined with Levenshtein distance
using TheFuzz.
This can increase the accuracy of your results.
If not specified, the process's attribute is used.
Returns
-------
@@ -271,7 +312,9 @@
"and no choices were provided."
)
self.index(options=choices)
indices, distances = self.query([query], limit=limit)
indices, distances = self.query(
[query], limit=limit, refine_levenshtein=refine_levenshtein
)
indices = np.ravel(indices)
distances = np.ravel(distances)
scores = (1 - distances) * 100
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -2,7 +2,7 @@
line-length=79
[tool.poetry]
name = "neofuzz"
version = "0.2.0"
version = "0.3.0"
description = "Blazing fast fuzzy text search for Python."
authors = ["Márton Kardos <[email protected]>"]
license = "MIT"
@@ -16,6 +16,7 @@ pynndescent = ">=0.5.0, <0.6.0"
numpy = ">=0.22.0, <2.0.0"
tokenizers = ">=0.19.0, <0.20.0"
joblib = ">=1.4.0, <1.5.0"
thefuzz = ">=0.22.0, <0.23.0"


[build-system]
