From bb2569e8b321d086a50ecd49d34cf322e7ead0de Mon Sep 17 00:00:00 2001
From: Hugo ter Doest
Date: Sat, 6 Jul 2024 23:35:22 +0200
Subject: [PATCH] Add removeDocument to TFIDF (#749)

* Add removeDocument to TFIDF

* Bug

* Typo
---
 lib/natural/tfidf/tfidf.js | 19 +++++++++++++++++++
 spec/tfidf_spec.ts         | 14 ++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/lib/natural/tfidf/tfidf.js b/lib/natural/tfidf/tfidf.js
index 7faf42c24..d67a30352 100644
--- a/lib/natural/tfidf/tfidf.js
+++ b/lib/natural/tfidf/tfidf.js
@@ -132,6 +132,25 @@ class TfIdf {
     }
   }
 
+  // Remove a document from the corpus
+  // Returns true if the document was found
+  // Returns false if the document was not found
+  removeDocument (key) {
+    // Find the document
+    const index = this.documents.findIndex(function (document) {
+      return document.__key === key
+    })
+    // If found, remove it
+    if (index > -1) {
+      this.documents.splice(index, 1)
+      // Invalidate the cache
+      this._idfCache = Object.create(null)
+      return true
+    }
+
+    return false
+  }
+
   // If restoreCache is set to true, all terms idf scores currently cached will be recomputed.
   // Otherwise, the cache will just be wiped clean
   addFileSync (path, encoding, key, restoreCache) {
diff --git a/spec/tfidf_spec.ts b/spec/tfidf_spec.ts
index 10b44d968..eeadecc3f 100644
--- a/spec/tfidf_spec.ts
+++ b/spec/tfidf_spec.ts
@@ -283,4 +283,18 @@ describe('tfidf', function () {
       expect(tfidf.setStopwords(stopwords)).toEqual(false)
     })
   })
+
+  describe('Remove documents', function () {
+    it('should remove a document', function () {
+      tfidf = new TfIdf()
+
+      tfidf.addDocument('this document is about node.', 0)
+      tfidf.addDocument('this document isn\'t about node.', 1)
+
+      const result1 = tfidf.removeDocument(0)
+      expect(result1).toEqual(true)
+      const result2 = tfidf.removeDocument(0)
+      expect(result2).toEqual(false)
+    })
+  })
 })