Merge pull request #2 from ZhaoDingfan/opennlp-impl

Merge Implementation of OpenNLP branch back to the master branch
openMF · Jun 19, 2018 · feec441 · feec441
2 parents efef490 + 0a0933b
commit feec441
Show file tree

Hide file tree

Showing 30 changed files with 40,499 additions and 88 deletions.
diff --git a/core/src/main/java/org/mifos/chatbot/core/NLPService.java b/core/src/main/java/org/mifos/chatbot/core/NLPService.java
@@ -7,5 +7,5 @@
  * If there are other NLP engines, simply add more interfaces for additional NLP engines
  */
 public interface NLPService {
-    Intent recognize(String text);
+    Intent[] recognize(String text);
 }
diff --git a/core/src/main/java/org/mifos/chatbot/core/model/Intent.java b/core/src/main/java/org/mifos/chatbot/core/model/Intent.java
@@ -1,5 +1,28 @@
 package org.mifos.chatbot.core.model;
 
+/** Data holder for one recognised user intent. */
 public class Intent {
-    // the data holder
+    // An Intent represents a mapping between what a user says and what action the chatbot should take.
+
+    // NOTE: no confidence level is stored yet; the idea is to return error feedback when it is below a threshold.
+    // The keyword phrase this intent was created with.
+    private String keyword;
+
+    public Intent(String keyword) {
+        this.keyword = keyword;
+    }
+
+    public String getKeyword() {
+        return keyword;
+    }
+
+    public void setKeyword(String keyword) {
+        this.keyword = keyword;
+    }
+
+    // Author's notes (apparently open to-do items): API category, function and parameters
+
+    // get Mifos working on my own machine
+
+    // design the Mifos API client
 }
diff --git a/nlp/build.gradle b/nlp/build.gradle
@@ -29,13 +29,21 @@ dependencyManagement {
 }
 
 dependencies {
-
     compile('org.springframework.boot:spring-boot-starter')
     testCompile('org.springframework.boot:spring-boot-starter-test')
     testCompile group: 'junit', name: 'junit', version: '4.4'
     compile 'org.apache.opennlp:opennlp-tools:1.8.4'
     compile 'org.springframework:spring-core'
     compile project(':core')
+    testCompile group: 'org.slf4j', name: 'slf4j-simple', version: '1.6.1'
 }
 
+// This configuration excludes all the logging classes
+//configurations {
+//    all*.exclude module : 'spring-boot-starter-logging'
+//}
+
+configurations {
+    compile.exclude group:'ch.qos.logback'
+}
 
diff --git a/nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPModelTrainer.java b/nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPModelTrainer.java
@@ -0,0 +1,73 @@
+package org.mifos.chatbot.nlp;
+
+import opennlp.tools.namefind.*;
+import opennlp.tools.util.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+
+/**
+ * Trains the named entity recognition model, i.e. the model related to financial terminologies.
+ *
+ * Ideas come from https://www.tutorialkart.com/opennlp/ner-training-in-opennlp-with-name-finder-training-java-example/
+ */
+public class OpenNLPModelTrainer {
+
+    private final Logger logger = LoggerFactory.getLogger(OpenNLPModelTrainer.class);
+
+    /**
+     * Reads the tagged training sentences, trains a name finder model and saves it to a file.
+     * About 15,000 sentences will be enough for the model to output satisfactory named entities.
+     * <p>All paths are relative to the working directory of the running JVM.
+     *
+     * @return {@code true} when the model was trained and written, {@code false} when any step failed
+     */
+    public Boolean train() {
+        // Step 1: read the training data
+        InputStreamFactory in;
+        try {
+            in = new MarkableFileInputStreamFactory(new File("src/main/resources/TrainingDataFinance-2.txt"));
+        } catch (FileNotFoundException e) {
+            logger.error("FileNotFoundException Step 1 : ", e);
+            return false;
+        }
+
+        ObjectStream<NameSample> inputStream;
+        try {
+            inputStream = new NameSampleDataStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+        } catch (IOException e) {
+            logger.error("IOException Step 2 : ", e);
+            return false;
+        }
+
+        // Step 2: prepare training parameters
+        TrainingParameters parameters = new TrainingParameters();
+        parameters.put(TrainingParameters.ITERATIONS_PARAM, 100);
+        parameters.put(TrainingParameters.CUTOFF_PARAM, 1);
+
+        // Step 3: train the model; try-with-resources closes the sample stream once training ends
+        TokenNameFinderModel nameFinderModel;
+        try (ObjectStream<NameSample> samples = inputStream) {
+            nameFinderModel = NameFinderME.train("en", null, samples, parameters,
+                    TokenNameFinderFactory.create(
+                            null, null, Collections.emptyMap(), new BioCodec()));
+        } catch (IOException e) {
+            logger.error("IOException Step 3 : ", e);
+            return false;
+        }
+
+        // Step 4: save the model to a file; the output stream is closed even when serialization fails
+        File output = new File("src/main/resources/models/en-ner-second-try.bin");
+        try (FileOutputStream outputStream = new FileOutputStream(output)) {
+            nameFinderModel.serialize(outputStream);
+        } catch (IOException e) {
+            logger.error("IOException Step 4 : ", e);
+            return false;
+        }
+
+        return true;
+    }
+}
diff --git a/nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPService.java b/nlp/src/main/java/org/mifos/chatbot/nlp/OpenNLPService.java
@@ -1,21 +1,116 @@
 package org.mifos.chatbot.nlp;
 
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
 import org.mifos.chatbot.core.NLPService;
 import org.mifos.chatbot.core.model.Intent;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 public class OpenNLPService implements NLPService {
 
+    Logger logger = LoggerFactory.getLogger(OpenNLPService.class);
+
+    /**
+     * Recognises the user's input to find out what their intention is.
+     *
+     * Instead of finding one specific API to handle the request, this narrows the scope down to several
+     * keywords (one {@link Intent} per span reported by the name finder), which are then processed further.
+     *
+     * @param text the message of user input; it may contain several sentences about their commands
+     * @return one intent per recognised span, or {@code null} when a model file cannot be read
+     *
+     * @author Zhao Dingfan
+     */
     @Override
-    public Intent recognize(String text) {
+    public Intent[] recognize(String text) {
+        try {
+            String[] sentences = detectSentence(text);
+            // Tokens of all sentences are collected into one list, so the span indices below refer to it.
+
+            List<String> tokens = new ArrayList<>();
+            for(String sentence : sentences) {
+                tokens.addAll(Arrays.asList(tokenize(sentence)));
+            }
+
+            String[] tokenString = new String[tokens.size()];
+            tokenString = tokens.toArray(tokenString);
+            Span[] resultSpans = findName(tokenString);
+            // Each span is read as tokens [start, end); they are joined with single spaces into a keyword.
+            Intent[] resultIntents = new Intent[resultSpans.length];
+            for(int i = 0; i < resultIntents.length ; i++) {
+                StringBuilder sb = new StringBuilder();
+                sb.append(tokenString[resultSpans[i].getStart()]);
+                for(int j = resultSpans[i].getStart()+1 ; j < resultSpans[i].getEnd() ; j++) {
+                    sb.append(" ");
+                    sb.append(tokenString[j]);
+                }
+                resultIntents[i] = new Intent(sb.toString());
+            }
+
+            // The array is empty (not null) when the name finder reports no spans.
+            return resultIntents;
+        } catch (IOException e) {
+            logger.error("Cannot read model information : ", e);
+        }
+
         return null;
     }
 
     // Refer to OpenNLP framework as much as I want
 
-    // OpenNLP may has to be trained by myself to be financial focused
     // May start with 10 keywords first, let framework understand different sentence structure.
 
+    /**
+     * Splits the paragraph into sentences with the OpenNLP sentence detector.
+     * @throws IOException if the sentence model file cannot be read
+     */
+    private String[] detectSentence(String paragraph) throws IOException {
+        // try-with-resources closes the model stream even if loading or detection throws
+        try (InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin")) {
+            String[] sentences = new SentenceDetectorME(new SentenceModel(is)).sentDetect(paragraph);
 
+            logger.debug("Detected sentences: {}", Arrays.asList(sentences));
+            return sentences;
+        }
+    }
 
-    // create training set and training model by myself instead of using pre-trained model
+    /**
+     * Splits one sentence into tokens with the OpenNLP tokenizer model.
+     *
+     * @throws IOException if the tokenizer model file cannot be read
+     */
+    private String[] tokenize(String sentence) throws IOException {
+        // NOTE(review): this model is loaded from src/test, unlike the sentence model — confirm the path.
+        try (InputStream is = new FileInputStream("src/test/resources/models/en-token.bin")) {
+            Tokenizer tokenizer = new TokenizerME(new TokenizerModel(is));
+            return tokenizer.tokenize(sentence);
+        }
+    }
+
+    /**
+     * Runs the trained name finder over the tokens and returns the spans it found.
+     * @throws IOException if the name finder model file cannot be read
+     */
+    private Span[] findName(String[] tokens) throws IOException {
+        // NOTE(review): read from src/test although the trainer writes to src/main — confirm the path.
+        try (InputStream is = new FileInputStream("src/test/resources/models/en-ner-second-try.bin")) {
+            Span[] nameSpans = new NameFinderME(new TokenNameFinderModel(is)).find(tokens);
+            logger.debug("{} spans found. ", nameSpans.length);
+            return nameSpans;
+        }
+    }
 }
diff --git a/nlp/src/main/java/org/mifos/chatbot/nlp/TrainingDataGeneration.java b/nlp/src/main/java/org/mifos/chatbot/nlp/TrainingDataGeneration.java
@@ -0,0 +1,88 @@
+package org.mifos.chatbot.nlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Random;
+
+/** Generates the tagged mock sentences used as training data for the OpenNLP name finder. */
+public class TrainingDataGeneration {
+
+    private static final String[] VERB_CHOICES =
+            {"look", "load", "What is", "write to", "how", "update", "delete"};
+    private static final String[] CONJ_CHOICES = {"my", "the", "the other user's"};
+    private static final String[] NOUN_CHOICES =
+            {"status of loan", "interest", "outstanding principal", "next due day", "due principal"};
+
+    public static void main(String[] args) {
+        System.out.println(System.getProperty("user.dir"));
+        dataFileGeneration();
+    }
+
+    /**
+     * Writes 20,000 randomly combined training sentences, one per line, to the training data file.
+     * The path is relative to the working directory, which {@link #main} prints first.
+     */
+    private static void dataFileGeneration() {
+        File fout = new File("nlp/src/main/resources/TrainingDataFinance-2.txt");
+        // One generator for the whole run. Each bound is the array length, so every choice can be
+        // picked; the previous fixed bound of 6 never produced the last verb ("delete").
+        Random random = new Random();
+        // Written as UTF-8 explicitly; try-with-resources closes the file even if a write fails.
+        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
+                new FileOutputStream(fout), java.nio.charset.StandardCharsets.UTF_8))) {
+            for (int i = 0; i < 20000; i++) {
+                String content = generateData(random.nextInt(VERB_CHOICES.length),
+                        random.nextInt(CONJ_CHOICES.length), random.nextInt(NOUN_CHOICES.length));
+                bw.write(content);
+                bw.newLine();
+            }
+        } catch (IOException e) {
+            System.out.println("Cannot write to this file: " + fout + " (" + e + ")");
+        }
+    }
+
+    /**
+     * Generates one mock user input.
+     *
+     * @param firstIdx  index of the verb to use
+     * @param secondIdx index of the context word to use
+     * @param thirdIdx  index of the category noun to use
+     * @return the training sentence with the respective tags
+     *
+     * @author Dingfan
+     */
+    private static String generateData(int firstIdx, int secondIdx, int thirdIdx) {
+        return generateVerbTag(VERB_CHOICES[firstIdx])
+                + " "
+                + generateContextTag(CONJ_CHOICES[secondIdx])
+                + generateCategoryTag(NOUN_CHOICES[thirdIdx]);
+    }
+
+    // Open question: there are approximately two types of actions, read and write.
+    // Do we recognize these two actions as one type or as two different tags?
+
+    // OpenNLP is used for entity extraction, so each part of the sentence is wrapped in an entity tag.
+    /** Wraps the verb in an {@code action} tag. */
+    private static String generateVerbTag(String verb) {
+        return " <START:action> " + verb + " <END>";
+    }
+
+    /** Wraps the context word in a {@code context} tag. */
+    private static String generateContextTag(String context) {
+        return " <START:context> " + context + " <END>";
+    }
+
+    /**
+     * Wraps the noun in a {@code category} tag. The category does not need to cover the day,
+     * because that is handled by the Mifos API, so "day" is split off into its own {@code date} tag.
+     */
+    private static String generateCategoryTag(String category) {
+        StringBuilder sb = new StringBuilder(" <START:category> ").append(category).append(" <END>");
+        int dayIdx = sb.indexOf("day");
+        if (dayIdx != -1) {
+            // e.g. "next due day" -> " <START:category> next due  <END> <START:date> day <END>"
+            sb.insert(dayIdx, " <END> <START:date> ");
+        }
+        return sb.toString();
+    }
+}