Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

新增千分位格式数字识别 #803

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package org.ansj.recognition.impl;

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.recognition.Recognition;

import java.util.*;

/**
 * Recognizes numbers written with a thousands separator (for example
 * {@code 1,234,567.11} or {@code 1,234,567}) and merges their pieces into a
 * single term tagged with the {@code thousands_separator} nature.
 *
 * <p>A number is accepted when it has the shape
 * {@code <1-3 chars>(<separator><3 digits>)+[.<fraction>]}: the leading group is at
 * most three characters long and has no decimal point, every following group has
 * exactly three characters before an optional decimal point, and a group that
 * carries a fractional part always ends the number.
 *
 * @author baicaixiaozhan
 * @since v5.1.6
 */
public class ThousandsSeparatorRecognition implements Recognition {

    private static final TermNatures THOUSANDS_SEPARATOR_M = new TermNatures(new TermNature("thousands_separator", 1));

    /** Number of digits every group after the leading one must have. */
    private static final int GROUP_SIZE = 3;

    /** Offset written onto terms that were merged away; such terms are dropped at the end of a pass. */
    private static final int REMOVED_OFFSET = -1;

    /** Character that starts the fractional part of a number. */
    private static final String DECIMAL_POINT = ".";

    private String separator;

    /** Creates a recognizer that uses {@code ","} as the thousands separator. */
    public ThousandsSeparatorRecognition() {
        this.separator = ",";
    }

    /**
     * Creates a recognizer that uses a custom thousands separator.
     *
     * @param separator the text that separates digit groups, e.g. {@code "_"}
     */
    public ThousandsSeparatorRecognition(String separator) {
        this.separator = separator;
    }

    /**
     * @return the text currently used as the thousands separator
     */
    public String getSeparator() {
        return separator;
    }

    /**
     * @param separator the text that separates digit groups
     */
    public void setSeparator(String separator) {
        this.separator = separator;
    }

    /**
     * Merges every thousands-formatted number found in {@code result} into one term
     * and removes the terms that were swallowed by a merge from the result's term list.
     *
     * @param result the segmentation result to rewrite in place
     */
    @Override
    public void recognition(Result result) {
        List<Term> terms = result.getTerms();
        // A null or empty separator can never equal a term name, so nothing would match.
        if (terms == null || terms.isEmpty() || separator == null || separator.isEmpty()) {
            return;
        }

        for (Term term : terms) {
            if (term.getOffe() == REMOVED_OFFSET) {
                continue;
            }
            if (!Objects.equals(term.termNatures(), TermNatures.M_ALB) || !isLeadingGroup(term.getName())) {
                continue;
            }

            boolean merged = false;
            Term group;
            while ((group = nextGroup(term)) != null) {
                doMerge(term);
                merged = true;
                // A group with a fractional part (e.g. "567.11") is the end of the number.
                if (group.getName().contains(DECIMAL_POINT)) {
                    break;
                }
            }
            if (merged) {
                term.updateTermNaturesAndNature(THOUSANDS_SEPARATOR_M);
            }
        }

        // Drop the separator and group terms that were folded into a head term above.
        for (Iterator<Term> iterator = terms.iterator(); iterator.hasNext();) {
            if (iterator.next().getOffe() == REMOVED_OFFSET) {
                iterator.remove();
            }
        }
    }

    /**
     * Folds the two terms following {@code head} (a separator and a digit group) into
     * {@code head} and marks both as removed.
     *
     * <p>NOTE(review): this relies on {@code Term.merage} appending the merged term's name
     * and advancing {@code head.to()} past it - confirm against {@code Term}.
     *
     * @param head the term that accumulates the whole number
     */
    private void doMerge(Term head) {
        Term separatorTerm = head.to();
        head.merage(separatorTerm);
        separatorTerm.setOffe(REMOVED_OFFSET);

        Term groupTerm = head.to();
        head.merage(groupTerm);
        groupTerm.setOffe(REMOVED_OFFSET);
    }

    /**
     * Tells whether {@code name} can start a thousands-formatted number.
     *
     * <p>The part before the first separator (or the whole name when it has none) must be
     * at most {@value #GROUP_SIZE} characters long, and the name must not contain a decimal
     * point: a value such as {@code 1.5} is three characters long but is not a digit group.
     *
     * @param name the text of the candidate head term, may be {@code null}
     * @return {@code true} when the term may be extended with further groups
     */
    private boolean isLeadingGroup(String name) {
        if (name == null || name.contains(DECIMAL_POINT)) {
            return false;
        }
        int separatorIndex = name.indexOf(separator);
        int leadingLength = separatorIndex >= 0 ? separatorIndex : name.length();
        return leadingLength <= GROUP_SIZE;
    }

    /**
     * Looks for a {@code <separator><group>} pair directly after {@code head}.
     *
     * <p>The check walks forward from {@code head} only; it does not consult the
     * {@code from()} back-links, which {@link #doMerge(Term)} does not update.
     *
     * @param head the term that accumulates the whole number
     * @return the group term that may be merged next, or {@code null} when the terms after
     *         {@code head} do not continue a thousands-formatted number
     */
    private Term nextGroup(Term head) {
        Term separatorTerm = head.to();
        if (separatorTerm == null || !Objects.equals(separatorTerm.getName(), separator)) {
            return null;
        }

        Term group = separatorTerm.to();
        if (group == null || group.getName() == null
                || !Objects.equals(group.termNatures(), TermNatures.M_ALB)) {
            return null;
        }

        // Exactly three characters must precede the decimal point, or make up the whole group.
        String name = group.getName();
        int decimalIndex = name.indexOf(DECIMAL_POINT);
        int integerLength = decimalIndex >= 0 ? decimalIndex : name.length();
        return integerLength == GROUP_SIZE ? group : null;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package org.ansj.recognition.impl;

import org.ansj.domain.Result;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.junit.Assert;
import org.junit.Test;


/**
 * Unit tests for the recognition of numbers written with a thousands separator.
 *
 * @author baicaixiaozhan
 * @since v5.1.6
 */
public class ThousandsSeparatorRecognitionTest {

    /** Segments {@code text} and applies a recognizer that uses the default separator. */
    private static Result recognizeWithDefaultSeparator(String text) {
        return ToAnalysis.parse(text).recognition(new ThousandsSeparatorRecognition());
    }

    /** Asserts the nature-free rendering of {@code text} after recognition, joined by {@code " | "}. */
    private static void assertSegments(String expected, String text) {
        Result segmented = recognizeWithDefaultSeparator(text);
        Assert.assertEquals(expected, segmented.toStringWithOutNature(" | "));
    }

    /** A separated number is merged while a plain decimal in the same text keeps its nature. */
    @Test
    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorExisted() {
        Result recognized = recognizeWithDefaultSeparator("当日访问量为10,234,543 10000.00。是预期结果");

        Assert.assertEquals("10,234,543", recognized.get(3).getName());
        Assert.assertEquals("10,234,543/thousands_separator", recognized.get(3).toString());
        Assert.assertEquals("10000.00", recognized.get(5).getName());
        Assert.assertEquals("10000.00/m", recognized.get(5).toString());
    }

    /** Two recognizers with different separators can be chained on the same result. */
    @Test
    public void test_ThousandsSeparatorRecognition_whenUseCustomSeparator() {
        ThousandsSeparatorRecognition commaRecognizer = new ThousandsSeparatorRecognition(",");
        ThousandsSeparatorRecognition underscoreRecognizer = new ThousandsSeparatorRecognition("_");

        Result recognized = ToAnalysis.parse("10,234,543 102_234_543.00")
                .recognition(commaRecognizer)
                .recognition(underscoreRecognizer);

        Assert.assertEquals("10,234,543", recognized.get(0).getName());
        Assert.assertEquals("10,234,543/thousands_separator", recognized.get(0).toString());
        Assert.assertEquals("102_234_543.00", recognized.get(2).getName());
        Assert.assertEquals("102_234_543.00/thousands_separator", recognized.get(2).toString());
    }

    /** Malformed groups are only merged up to the last well-formed group. */
    @Test
    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorIsError() {
        assertSegments("10,234 | , | 5430", "10,234,5430");
        assertSegments("1088 | , | 234 | , | 5430", "1088,234,5430");
        assertSegments("108,234 | , | 5430.00", "108,234,5430.00");
        assertSegments("108,234.00 | , | 430.00", "108,234.00,430.00");
    }

    /** The separated number sits at the end of the text. */
    @Test
    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorInEnd() {
        Result recognized = recognizeWithDefaultSeparator("存在金额:100,234,543.00元");

        Assert.assertEquals("100,234,543.00元", recognized.get(3).getName());
    }

    /** The separated number sits at the start of the text. */
    @Test
    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorInStart() {
        Result recognized = recognizeWithDefaultSeparator("100,234,543.00是预期结果");

        Assert.assertEquals("100,234,543.00", recognized.get(0).getName());
    }

    /** The separated number sits in the middle of the text. */
    @Test
    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorInCenter() {
        Result recognized = recognizeWithDefaultSeparator("当日访问量为10,234,543。是预期结果");

        Assert.assertEquals("10,234,543", recognized.get(3).getName());
    }

}