Change @node-rs/jieba to the WASM version to achieve zero native footprint #343

Merged
merged 4 commits into from
Dec 28, 2024
2 changes: 1 addition & 1 deletion packages/long-memory/package.json
@@ -49,7 +49,7 @@
],
"dependencies": {
"@langchain/core": "^0.3.18",
"@node-rs/jieba": "^2.0.1",
"jieba-wasm": "^2.2.0",
"stopwords-iso": "^1.1.0",
"tiny-segmenter": "^0.2.0"
},
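For context (not part of the diff), a minimal sketch of how the WASM package is consumed; jieba-wasm ships a pure WebAssembly build, so no platform-specific native binary is pulled in at install time. The sample sentence and logged output below are illustrative assumptions, not taken from this PR.

// Minimal sketch: jieba-wasm exposes cut() as a plain function,
// so no dictionary loading or Jieba instance is needed.
import { cut } from 'jieba-wasm'

// The second argument toggles HMM-based new-word detection; the PR passes false.
const tokens: string[] = cut('长期记忆相似度计算', false)
console.log(tokens) // e.g. [ '长期', '记忆', '相似度', '计算' ] (exact segmentation may vary)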
60 changes: 35 additions & 25 deletions packages/long-memory/src/similarity.ts
@@ -1,9 +1,7 @@
import { Jieba } from '@node-rs/jieba'
import { dict } from '@node-rs/jieba/dict'
import { cut } from 'jieba-wasm'
import TinySegmenter from 'tiny-segmenter'

Check warning on line 2 in packages/long-memory/src/similarity.ts (GitHub Actions / lint): Import name `TinySegmenter` must match one of the following formats: camelCase, UPPER_CASE
import stopwords from 'stopwords-iso'

const jieba = Jieba.withDict(dict)
const segmenter = new TinySegmenter()

const SIMILARITY_WEIGHTS = {
@@ -79,7 +77,7 @@
let currentText = text

if (languages.has('zh')) {
const zhTokens = jieba.cut(currentText, false)
const zhTokens = cut(currentText, false)
currentText = zhTokens.join('▲')
}

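As an aside, a small sketch of what this branch produces (the input string is a hypothetical example, not from the PR): Chinese tokens returned by cut() are re-joined with the '▲' sentinel, presumably so a later split on that marker recovers them as separate terms.

import { cut } from 'jieba-wasm'

// Hypothetical input, chosen only to illustrate the join step above.
let currentText = '检索长期记忆'
const zhTokens = cut(currentText, false) // e.g. [ '检索', '长期', '记忆' ]
currentText = zhTokens.join('▲')         // '检索▲长期▲记忆'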
@@ -188,37 +186,49 @@
}

private static calculateBM25Similarity(s1: string, s2: string): number {
const k1 = 1.5
const b = 0.75
const k1 = 1.5 // term frequency saturation parameter
const b = 0.75 // document length normalization parameter
const epsilon = 0.25 // smoothing factor

const tokens1 = TextTokenizer.tokenize(s1)
const tokens2 = TextTokenizer.tokenize(s2)

const docLength = tokens2.length
const avgDocLength = (tokens1.length + tokens2.length) / 2
const doc1Length = tokens1.length
const doc2Length = tokens2.length
const avgDocLength = (doc1Length + doc2Length) / 2

const termFreqDoc1 = new Map<string, number>()
const termFreqDoc2 = new Map<string, number>()

const uniqueTerms = new Set([...tokens1, ...tokens2])

const termFrequencies = new Map<string, number>()
tokens1.forEach((token) => {
termFrequencies.set(token, (termFrequencies.get(token) || 0) + 1)
termFreqDoc1.set(token, (termFreqDoc1.get(token) || 0) + 1)
})

tokens2.forEach((token) => {
termFreqDoc2.set(token, (termFreqDoc2.get(token) || 0) + 1)
})

let score = 0
for (const [term, tf] of termFrequencies) {
const termFreqInDoc2 = tokens2.filter((t) => t === term).length
if (termFreqInDoc2 === 0) continue

const idf = Math.log(
1 +
Math.abs(tokens1.length - termFreqInDoc2 + 0.5) /
(termFreqInDoc2 + 0.5)
)
const numerator = tf * (k1 + 1)
const denominator =
tf + k1 * (1 - b + b * (docLength / avgDocLength))

score += idf * (numerator / denominator)

// Compute the BM25 score for each term
for (const term of uniqueTerms) {
const tf = termFreqDoc1.get(term) || 0
const docFreq = (termFreqDoc2.get(term) || 0) > 0 ? 1 : 0

const idf = Math.log((2 - docFreq + epsilon) / (docFreq + epsilon))

if (tf > 0) {
const numerator = tf * (k1 + 1)
const denominator =
tf + k1 * (1 - b + b * (doc2Length / avgDocLength))
score += idf * (numerator / denominator)
}
}

return score / tokens1.length
const maxPossibleScore = Math.log(2) * doc1Length // theoretical maximum score
return score / maxPossibleScore
}

public static calculate(str1: string, str2: string): SimilarityResult {
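For reference, the per-term quantity computed by the rewritten calculateBM25Similarity above can be restated as follows (a sketch of the code's own formula, treating the two input strings as a two-document collection with N = 2; |d1| and |d2| are the token counts of s1 and s2, avgdl their mean):

IDF(t) = ln( (2 - df_t + ε) / (df_t + ε) ),  where df_t = 1 if t occurs in d2, else 0, and ε = 0.25
score(t) = IDF(t) · tf_t · (k1 + 1) / ( tf_t + k1 · (1 - b + b · |d2| / avgdl) ),  counted only when tf_t > 0
similarity = Σ_t score(t) / ( ln(2) · |d1| )

With this IDF, a term appearing in both strings (df_t = 1) contributes 0, since ln(1.25 / 1.25) = 0, while a term present only in d1 contributes with weight ln(2.25 / 0.25) = ln 9 ≈ 2.2; the ln(2) · |d1| normalizer is the theoretical maximum named in the code's comment.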