This repository has been archived by the owner on Aug 29, 2023. It is now read-only.

Commit

Merge pull request #2 from ZhaoDingfan/opennlp-impl
Merge the implementation of the OpenNLP branch back into the master branch
dfz2019 authored Jun 19, 2018
2 parents efef490 + 0a0933b commit feec441
Showing 30 changed files with 40,499 additions and 88 deletions.
2 changes: 1 addition & 1 deletion core/src/main/java/org/mifos/chatbot/core/NLPService.java
@@ -7,5 +7,5 @@
* If there are other NLP engines, simply add more interfaces for additional NLP engines
*/
public interface NLPService {
Intent recognize(String text);
Intent[] recognize(String text);
}
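With this signature change, recognize(...) now returns every intent found in the input rather than a single one. A minimal sketch of how a caller might consume the new return type (illustrative only, not part of this commit; the sample sentence is made up):

NLPService nlpService = new OpenNLPService();
Intent[] intents = nlpService.recognize("What is the interest of my loan?");
if (intents != null) {
    for (Intent intent : intents) {
        // each Intent currently carries only the recognized keyword
        System.out.println("Recognized keyword: " + intent.getKeyword());
    }
}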
25 changes: 24 additions & 1 deletion core/src/main/java/org/mifos/chatbot/core/model/Intent.java
@@ -1,5 +1,28 @@
package org.mifos.chatbot.core.model;

// the data holder
public class Intent {
// the data holder
// An Intent represents a mapping between what a user says and the action the chatbot should take.

// TODO: provide a confidence level for the recognition; when it falls
// below the threshold, return error feedback.
private String keyword;

public Intent(String keyword) {
this.keyword = keyword;
}

public String getKeyword() {
return keyword;
}

public void setKeyword(String keyword) {
this.keyword = keyword;
}

// TODO: capture the API category, function and parameters

// TODO: get Mifos working on my own machine

// TODO: design the Mifos API client
}
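The comments above anticipate attaching a confidence score to each recognition so low-confidence results can be rejected. One possible shape for that, purely a sketch and not part of this commit (the class name, field, and threshold handling are assumptions):

// Hypothetical extension: an Intent carrying the recognizer's confidence score.
public class ScoredIntent extends Intent {
    private final double confidence; // probability reported by the recognizer, between 0.0 and 1.0

    public ScoredIntent(String keyword, double confidence) {
        super(keyword);
        this.confidence = confidence;
    }

    public double getConfidence() {
        return confidence;
    }

    // Callers could return error feedback when this falls below a chosen threshold.
    public boolean isConfident(double threshold) {
        return confidence >= threshold;
    }
}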
10 changes: 9 additions & 1 deletion nlp/build.gradle
@@ -29,13 +29,21 @@ dependencyManagement {
}

dependencies {

compile('org.springframework.boot:spring-boot-starter')
testCompile('org.springframework.boot:spring-boot-starter-test')
testCompile group: 'junit', name: 'junit', version: '4.4'
compile 'org.apache.opennlp:opennlp-tools:1.8.4'
compile 'org.springframework:spring-core'
compile project(':core')
testCompile group: 'org.slf4j', name: 'slf4j-simple', version: '1.6.1'
}

// This configuration excludes all the logging classes
//configurations {
// all*.exclude module : 'spring-boot-starter-logging'
//}

configurations {
compile.exclude group:'ch.qos.logback'
}

73 changes: 73 additions & 0 deletions nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPModelTrainer.java
@@ -0,0 +1,73 @@
package org.mifos.chatbot.nlp;

import opennlp.tools.namefind.*;
import opennlp.tools.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Collections;

/**
* This class trains the named-entity recognition model for financial terminology.
* Ideas come from https://www.tutorialkart.com/opennlp/ner-training-in-opennlp-with-name-finder-training-java-example/
*/
public class OpenNLPModelTrainer {

private final Logger logger = LoggerFactory.getLogger(OpenNLPModelTrainer.class);

// Roughly 15,000 training sentences should be enough for the model to produce satisfactory named-entity output.
public Boolean train() {

// Step 1: read the training data
InputStreamFactory in = null;
try {
in = new MarkableFileInputStreamFactory(new File("src/main/resources/TrainingDataFinance-2.txt"));
} catch (FileNotFoundException e) {
logger.error("FileNotFoundException Step 1 : ", e);
return false;
}

ObjectStream inputStream = null;
try {
inputStream = new NameSampleDataStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
} catch (IOException e) {
logger.error("IOException Step 2 : ", e);
return false;
}

// Step 2: prepare training parameters
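// ITERATIONS_PARAM sets the number of training passes over the data;
// a CUTOFF_PARAM of 1 keeps every feature that occurs at least once.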
TrainingParameters parameters = new TrainingParameters();
parameters.put(TrainingParameters.ITERATIONS_PARAM, 100);
parameters.put(TrainingParameters.CUTOFF_PARAM, 1);

// Step 3: train the model
TokenNameFinderModel nameFinderModel = null;
try {
nameFinderModel = NameFinderME.train("en", null, inputStream, parameters,
TokenNameFinderFactory.create(
null,
null,
Collections.emptyMap(),
new BioCodec()
)
);
} catch (IOException e) {
logger.error("IOException Step 3 : ", e);
return false;
}

// Step 4: save the model to a file
File output = new File("src/main/resources/models/en-ner-second-try.bin");
try (FileOutputStream outputStream = new FileOutputStream(output)) {
nameFinderModel.serialize(outputStream);
} catch (IOException e) {
logger.error("IOException Step 4 : ", e);
return false;
}

return true;
}
}
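A minimal sketch of how this trainer might be run (the runner class below is hypothetical and not part of this commit):

public class TrainerRunner {
    public static void main(String[] args) {
        OpenNLPModelTrainer trainer = new OpenNLPModelTrainer();
        // reads src/main/resources/TrainingDataFinance-2.txt and, on success,
        // serializes the model to src/main/resources/models/en-ner-second-try.bin
        if (trainer.train()) {
            System.out.println("NER model trained and saved successfully.");
        } else {
            System.out.println("Training failed; see the log output for details.");
        }
    }
}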
101 changes: 98 additions & 3 deletions nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPService.java
@@ -1,21 +1,116 @@
package org.mifos.chatbot.nlp;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import org.mifos.chatbot.core.NLPService;
import org.mifos.chatbot.core.model.Intent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class OpenNLPService implements NLPService {

Logger logger = LoggerFactory.getLogger(OpenNLPService.class);

/**
* This method recognises the user's input to find out their intention.
*
* Rather than picking one specific API to handle the result, recognize narrows the input down to a set of candidate functions,
* which are then processed further.
*
* @param text the user's input message; it may contain several sentences describing their commands
* @return the user's intentions, i.e. what they actually want to do
*
* @author Zhao Dingfan
*/
@Override
public Intent recognize(String text) {
public Intent[] recognize(String text) {
try {
String[] sentences = detectSentence(text);
// System.out.println(sentences.length);

List<String> tokens = new ArrayList<>();
for(String sentence : sentences) {
tokens.addAll(Arrays.asList(tokenize(sentence)));
}

String[] tokenString = new String[tokens.size()];
tokenString = tokens.toArray(tokenString);
Span[] resultSpans = findName(tokenString);

Intent[] resultIntents = new Intent[resultSpans.length];
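// join each span's tokens back into a single keyword phrase and wrap it in an Intent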
for(int i = 0; i < resultIntents.length ; i++) {
StringBuilder sb = new StringBuilder();
sb.append(tokenString[resultSpans[i].getStart()]);
for(int j = resultSpans[i].getStart()+1 ; j < resultSpans[i].getEnd() ; j++) {
sb.append(" ");
sb.append(tokenString[j]);
}
resultIntents[i] = new Intent(sb.toString());
}

// System.out.println(resultIntents[0].getKeyword());
return resultIntents;
} catch (IOException e) {
logger.error("Cannot read model information : ", e);
}

return null;
}

// Rely on the OpenNLP framework wherever possible.

// OpenNLP may have to be trained with custom data to become finance-focused.
// Start with about 10 keywords first, and let the framework learn different sentence structures.

private String[] detectSentence(String paragraph) throws IOException {
InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin");
SentenceModel model = new SentenceModel(is);
SentenceDetectorME sdetector = new SentenceDetectorME(model);

String sentences[] = sdetector.sentDetect(paragraph);
for(String sentence : sentences) {
System.out.println(sentence);
}
is.close();

return sentences;
}

// TODO: build a custom training set and model instead of using the pre-trained model
private String[] tokenize(String sentence) throws IOException {
InputStream is = new FileInputStream("src/test/resources/models/en-token.bin");
TokenizerModel model = new TokenizerModel(is);
Tokenizer tokenizer = new TokenizerME(model);

String tokens[] = tokenizer.tokenize(sentence);
// for (String a : tokens)
// System.out.println(a);
is.close();

return tokens;
}

private Span[] findName(String[] tokens) throws IOException {
InputStream is = new FileInputStream("src/test/resources/models/en-ner-second-try.bin");
TokenNameFinderModel model = new TokenNameFinderModel(is);
NameFinderME nameFinder = new NameFinderME(model);

Span nameSpans[] = nameFinder.find(tokens);

System.out.println(nameSpans.length + " spans found. ");

is.close();
return nameSpans;
}
}
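A rough test sketch for the service (hypothetical, not part of this commit; it assumes the sentence, tokenizer and NER model files exist at the paths the service reads from):

import org.junit.Assert;
import org.junit.Test;
import org.mifos.chatbot.core.model.Intent;

public class OpenNLPServiceTest {
    @Test
    public void recognizeFindsAtLeastOneIntentInALoanQuestion() {
        OpenNLPService service = new OpenNLPService();
        Intent[] intents = service.recognize("What is the interest of my loan?");
        Assert.assertNotNull(intents);
        Assert.assertTrue(intents.length > 0);
    }
}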
88 changes: 88 additions & 0 deletions nlp/src/main/java/org/mifos/chatbot/nlp/TrainingDataGeneration.java
@@ -0,0 +1,88 @@
package org.mifos.chatbot.nlp;

import java.io.*;
import java.util.ArrayList;
import java.util.Random;

public class TrainingDataGeneration{
public static void main(String[] args) {
System.out.println(System.getProperty("user.dir"));
dataFileGeneration();
}

private static void dataFileGeneration() {
File fout = new File("nlp/src/main/resources/TrainingDataFinance-2.txt");
try {
FileOutputStream fos = new FileOutputStream(fout);
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos));

Random random = new Random();
for(int i = 0 ; i < 20000 ; i++) {
// bounds match the sizes of the choice arrays in generateData (7 verbs, 3 contexts, 5 categories)
String content = generateData(random.nextInt(7), random.nextInt(3), random.nextInt(5));
bw.write(content);
bw.newLine();
}

bw.close();
} catch (IOException e) {
System.out.println("Cannot write to this file");
}
}

/**
* Generates one line of mock user input for training.
* @param firstIdx index into the verb (action) choices
* @param secondIdx index into the context choices
* @param thirdIdx index into the category choices
* @return the training sentence annotated with the respective tags
*
* @author Dingfan
*/
private static String generateData(int firstIdx, int secondIdx, int thirdIdx) {
String[] verbChoices = {"look", "load", "What is", "write to", "how", "update", "delete"};
String[] conjChoices = {"my", "the", "the other user's"};
String[] nounChoices = {"status of loan", "interest", "outstanding principal", "next due day", "due principal"};
StringBuffer sb = new StringBuffer();
sb.append(generateVerbTag(verbChoices[firstIdx]));
sb.append(" ");
sb.append(generateContextTag(conjChoices[secondIdx]));
sb.append(generateCategoryTag(nounChoices[thirdIdx]));

return sb.toString();
}

// Open question: there appear to be roughly two types of actions, read and write.
// Should we recognize these two actions as one type or as two different tags?

// OpenNLP is used here for entity extraction.
private static String generateVerbTag(String verb) {
StringBuffer sb = new StringBuffer();
sb.append(" <START:action> ");
sb.append(verb);
sb.append(" <END>");

return sb.toString();
}

private static String generateContextTag(String context) {
StringBuffer sb = new StringBuffer();
sb.append(" <START:context> ");
sb.append(context);
sb.append(" <END>");

return sb.toString();
}

// The category tag does not need to resolve the actual day value; that is handled by the Mifos API.
private static String generateCategoryTag(String category) {
StringBuffer sb = new StringBuffer();
sb.append(" <START:category> ");
sb.append(category);
sb.append(" <END>");
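// if the category mentions "day", split it into a separate date span;
// the " <END>" appended above then closes that inserted <START:date> span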
if(sb.indexOf("day") != -1) {
sb.insert(sb.indexOf("day"), " <END> <START:date> ");
}

return sb.toString();
}
}
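For reference, each line produced by this generator follows the OpenNLP name-finder training format. For example, the index combination (0, 0, 3) — "look", "my", "next due day" — would yield roughly the following line (illustrative, not copied from the generated file; the extra spaces come from the tag helper methods):

 <START:action> look <END>  <START:context> my <END> <START:category> next due <END> <START:date> day <END>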
