This repository has been archived by the owner on Aug 29, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from ZhaoDingfan/opennlp-impl
Merge Implementation of OpenNLP branch back to the master branch
- Loading branch information
Showing
30 changed files
with
40,499 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
25 changes: 24 additions & 1 deletion
25
core/src/main/java/org/mifos/chatbot/core/model/Intent.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,28 @@ | ||
package org.mifos.chatbot.core.model; | ||
|
||
// the data holder | ||
/**
 * Data holder for a recognised user intent.
 *
 * <p>An Intent represents a mapping between what a user says and what action the chatbot
 * should take. At the moment it only carries the recognised keyword.
 */
public class Intent {

    // TODO: provide the confidence level of the recognition. When the confidence level is
    // lower than the threshold, error feedback should be returned instead.

    /** The keyword (recognised phrase) this intent was built from. */
    private String keyword;

    /**
     * Creates an intent for the given keyword.
     *
     * @param keyword the recognised keyword; stored as-is, no validation is performed
     */
    public Intent(String keyword) {
        this.keyword = keyword;
    }

    /**
     * @return the keyword currently held by this intent
     */
    public String getKeyword() {
        return keyword;
    }

    /**
     * Replaces the keyword held by this intent.
     *
     * @param keyword the new keyword; stored as-is
     */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    // TODO: API category, function and parameters

    // TODO: find mifos working on my own machine

    // TODO: design mifos API client
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
73 changes: 73 additions & 0 deletions
73
nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPModelTrainer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
package org.mifos.chatbot.nlp; | ||
|
||
import opennlp.tools.namefind.*; | ||
import opennlp.tools.util.*; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.*; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.Collections; | ||
|
||
/** | ||
* This class is used to train the named entity recognition, which is the model related to financial terminologies | ||
* Ideas come from https://www.tutorialkart.com/opennlp/ner-training-in-opennlp-with-name-finder-training-java-example/ | ||
*/ | ||
public class OpenNLPModelTrainer { | ||
|
||
private final Logger logger = LoggerFactory.getLogger(OpenNLPModelTrainer.class); | ||
|
||
// about 15,000 sentences will be enough for the model to output satisfactory named entity. | ||
public Boolean train() { | ||
|
||
// Step 1: read the training data | ||
InputStreamFactory in = null; | ||
try { | ||
in = new MarkableFileInputStreamFactory(new File("src/main/resources/TrainingDataFinance-2.txt")); | ||
} catch (FileNotFoundException e) { | ||
logger.error("FileNotFoundException Step 1 : ", e); | ||
return false; | ||
} | ||
|
||
ObjectStream inputStream = null; | ||
try { | ||
inputStream = new NameSampleDataStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8)); | ||
} catch (IOException e) { | ||
logger.error("IOException Step 2 : ", e); | ||
return false; | ||
} | ||
|
||
// Step 2: prepare training parameters | ||
TrainingParameters parameters = new TrainingParameters(); | ||
parameters.put(TrainingParameters.ITERATIONS_PARAM, 100); | ||
parameters.put(TrainingParameters.CUTOFF_PARAM, 1); | ||
|
||
// Step 3: train the model | ||
TokenNameFinderModel nameFinderModel = null; | ||
try { | ||
nameFinderModel = NameFinderME.train("en", null, inputStream, parameters, | ||
TokenNameFinderFactory.create( | ||
null, | ||
null, | ||
Collections.emptyMap(), | ||
new BioCodec() | ||
) | ||
); | ||
} catch (IOException e) { | ||
logger.error("IOException Step 3 : ", e); | ||
return false; | ||
} | ||
|
||
// Step 4: save the model to a file | ||
File output = new File("src/main/resources/models/en-ner-second-try.bin"); | ||
try { | ||
FileOutputStream outputStream = new FileOutputStream(output); | ||
nameFinderModel.serialize(outputStream); | ||
} catch (IOException e) { | ||
logger.error("IOException Step 4 : ", e); | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
} |
101 changes: 98 additions & 3 deletions
101
nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,116 @@ | ||
package org.mifos.chatbot.nlp; | ||
|
||
import opennlp.tools.namefind.NameFinderME; | ||
import opennlp.tools.namefind.TokenNameFinderModel; | ||
import opennlp.tools.sentdetect.SentenceDetectorME; | ||
import opennlp.tools.sentdetect.SentenceModel; | ||
import opennlp.tools.tokenize.Tokenizer; | ||
import opennlp.tools.tokenize.TokenizerME; | ||
import opennlp.tools.tokenize.TokenizerModel; | ||
import opennlp.tools.util.Span; | ||
import org.mifos.chatbot.core.NLPService; | ||
import org.mifos.chatbot.core.model.Intent; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
public class OpenNLPService implements NLPService { | ||
|
||
Logger logger = LoggerFactory.getLogger(OpenNLPService.class); | ||
|
||
/** | ||
* This method is to recognise user's input to find out what is their intention | ||
* | ||
* Instead of finding a specific API to handle the result, the recognize function is used to narrow down the scope to several functions, | ||
* then apply further processing | ||
* | ||
* @param text This is the message of user input, it may contain several sentences about their commands | ||
* @return user's intention, basically what they really want to do | ||
* | ||
* @author Zhao Dingfan | ||
*/ | ||
@Override | ||
public Intent recognize(String text) { | ||
public Intent[] recognize(String text) { | ||
try { | ||
String[] sentences = detectSentence(text); | ||
// System.out.println(sentences.length); | ||
|
||
List<String> tokens = new ArrayList<>(); | ||
for(String sentence : sentences) { | ||
tokens.addAll(Arrays.asList(tokenize(sentence))); | ||
} | ||
|
||
String[] tokenString = new String[tokens.size()]; | ||
tokenString = tokens.toArray(tokenString); | ||
Span[] resultSpans = findName(tokenString); | ||
|
||
Intent[] resultIntents = new Intent[resultSpans.length]; | ||
for(int i = 0; i < resultIntents.length ; i++) { | ||
StringBuilder sb = new StringBuilder(); | ||
sb.append(tokenString[resultSpans[i].getStart()]); | ||
for(int j = resultSpans[i].getStart()+1 ; j < resultSpans[i].getEnd() ; j++) { | ||
sb.append(" "); | ||
sb.append(tokenString[j]); | ||
} | ||
resultIntents[i] = new Intent(sb.toString()); | ||
} | ||
|
||
// System.out.println(resultIntents[0].getKeyword()); | ||
return resultIntents; | ||
} catch (IOException e) { | ||
logger.error("Cannot read model information : ", e); | ||
} | ||
|
||
return null; | ||
} | ||
|
||
// Refer to OpenNLP framework as much as I want | ||
|
||
// OpenNLP may has to be trained by myself to be financial focused | ||
// May start with 10 keywords first, let framework understand different sentence structure. | ||
|
||
private String[] detectSentence(String paragraph) throws IOException { | ||
InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin"); | ||
SentenceModel model = new SentenceModel(is); | ||
SentenceDetectorME sdetector = new SentenceDetectorME(model); | ||
|
||
String sentences[] = sdetector.sentDetect(paragraph); | ||
for(String sentence : sentences) { | ||
System.out.println(sentence); | ||
} | ||
is.close(); | ||
|
||
return sentences; | ||
} | ||
|
||
// create training set and training model by myself instead of using pre-trained model | ||
private String[] tokenize(String sentence) throws IOException { | ||
InputStream is = new FileInputStream("src/test/resources/models/en-token.bin"); | ||
TokenizerModel model = new TokenizerModel(is); | ||
Tokenizer tokenizer = new TokenizerME(model); | ||
|
||
String tokens[] = tokenizer.tokenize(sentence); | ||
// for (String a : tokens) | ||
// System.out.println(a); | ||
is.close(); | ||
|
||
return tokens; | ||
} | ||
|
||
private Span[] findName(String[] tokens) throws IOException { | ||
InputStream is = new FileInputStream("src/test/resources/models/en-ner-second-try.bin"); | ||
TokenNameFinderModel model = new TokenNameFinderModel(is); | ||
NameFinderME nameFinder = new NameFinderME(model); | ||
|
||
Span nameSpans[] = nameFinder.find(tokens); | ||
|
||
System.out.println(nameSpans.length + " spans found. "); | ||
|
||
is.close(); | ||
return nameSpans; | ||
} | ||
} |
88 changes: 88 additions & 0 deletions
88
nlp/src/main/java/org/mifos/chatbot/nlp/TrainingDataGeneration.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
package org.mifos.chatbot.nlp; | ||
|
||
import java.io.*; | ||
import java.util.ArrayList; | ||
import java.util.Random; | ||
|
||
/**
 * Generates a synthetic, annotated training file for the OpenNLP name finder.
 *
 * <p>Every line is a mock user request built from one verb, one context word and one noun
 * phrase, each wrapped in OpenNLP {@code <START:type> ... <END>} markers.
 */
public class TrainingDataGeneration {

    /** Number of mock sentences written to the training file. */
    private static final int SAMPLE_COUNT = 20000;

    private static final String[] VERB_CHOICES =
            {"look", "load", "What is", "write to", "how", "update", "delete"};
    private static final String[] CONJ_CHOICES = {"my", "the", "the other user's"};
    private static final String[] NOUN_CHOICES =
            {"status of loan", "interest", "outstanding principal", "next due day", "due principal"};

    public static void main(String[] args) {
        System.out.println(System.getProperty("user.dir"));
        dataFileGeneration();
    }

    /**
     * Writes {@link #SAMPLE_COUNT} generated lines to the training data file. The path is
     * relative to the current working directory. Failures are reported on standard error.
     */
    private static void dataFileGeneration() {
        File fout = new File("nlp/src/main/resources/TrainingDataFinance-2.txt");
        // UTF-8 is set explicitly because the trainer reads this file as UTF-8;
        // try-with-resources closes the writer even when a write fails.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(fout), java.nio.charset.StandardCharsets.UTF_8))) {
            writeTrainingData(bw, SAMPLE_COUNT, new Random());
        } catch (IOException e) {
            System.err.println("Cannot write to this file: " + fout + " (" + e.getMessage() + ")");
        }
    }

    /**
     * Writes {@code count} randomly generated training lines to {@code out}.
     *
     * @param out    destination for the generated lines
     * @param count  number of lines to write
     * @param random source of randomness; indices are bounded by the choice arrays' lengths so
     *               every verb, context and noun can be produced
     * @throws IOException when writing to {@code out} fails
     */
    private static void writeTrainingData(Writer out, int count, Random random) throws IOException {
        String lineSeparator = System.lineSeparator();
        for (int i = 0; i < count; i++) {
            out.write(generateData(
                    random.nextInt(VERB_CHOICES.length),
                    random.nextInt(CONJ_CHOICES.length),
                    random.nextInt(NOUN_CHOICES.length)));
            out.write(lineSeparator);
        }
    }

    /**
     * This function is to generate mock user input
     * @param firstIdx index into the verb choices (action tag)
     * @param secondIdx index into the context choices (context tag)
     * @param thirdIdx index into the noun choices (category tag)
     * @return the training sentence with its respective tags
     * @throws ArrayIndexOutOfBoundsException when an index is outside its choice array
     *
     * @author Dingfan
     */
    private static String generateData(int firstIdx, int secondIdx, int thirdIdx) {
        // The context and category tags already start with a space, so only the verb tag
        // needs an explicit separator after it.
        return generateVerbTag(VERB_CHOICES[firstIdx])
                + " "
                + generateContextTag(CONJ_CHOICES[secondIdx])
                + generateCategoryTag(NOUN_CHOICES[thirdIdx]);
    }

    // Here left a question: I noticed that there are approximately two types of actions, read and write.
    // Do we recognize these two actions as one type or two different tags?

    // OpenNLP is used for entity extraction.

    /** Wraps {@code text} in a {@code <START:type> ... <END>} marker pair with a leading space. */
    private static String wrapInTag(String type, String text) {
        return " <START:" + type + "> " + text + " <END>";
    }

    /** Tags a verb as an {@code action} entity. */
    private static String generateVerbTag(String verb) {
        return wrapInTag("action", verb);
    }

    /** Tags a context word as a {@code context} entity. */
    private static String generateContextTag(String context) {
        return wrapInTag("context", context);
    }

    // For this category issue, it does not need to recognize the day, because it is handled by the Mifos API
    /**
     * Tags a noun phrase as a {@code category} entity. When the tagged text contains "day", the
     * category entity is closed just before it and the remainder is opened as a {@code date}
     * entity.
     */
    private static String generateCategoryTag(String category) {
        String tagged = wrapInTag("category", category);
        int dayIdx = tagged.indexOf("day");
        if (dayIdx == -1) {
            return tagged;
        }
        return tagged.substring(0, dayIdx) + " <END> <START:date> " + tagged.substring(dayIdx);
    }
}
Oops, something went wrong.