Skip to content

Commit

Permalink
Merge pull request #2159 from openturing/feature/ac-stopword-improve
Browse files Browse the repository at this point in the history
Feature/ac stopword improve
  • Loading branch information
alegauss authored Oct 2, 2024
2 parents e054bac + d0b8801 commit 3413263
Show file tree
Hide file tree
Showing 5 changed files with 429 additions and 100 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public TurSEStopWord(ResourceLoader resourceloader) {
private static final String STOP_WORD_CLASS_FILTER = "solr.StopFilterFactory";
private static final String WORDS_ATTRIBUTE = "words";
private static final String DEFAULT_STOP_WORD_FILE = "classpath:/solr/conf/lang/stopwords.txt";
private static final String APPLICATION_OCTET_STREAM_UTF8 = "application/octet-stream;charset:utf-8";
private static final String APPLICATION_OCTET_STREAM_UTF8 = "application/octet-stream;charset=utf-8";
private static final String ADMIN_FILE_URL = "%s/admin/file?contentType=%s&file=%s";

public List<String> getStopWords(TurSolrInstance turSolrInstance) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
package com.viglet.turing.sn.ac;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;

import java.util.*;

/**
 * The SuggestionAutomaton class represents a finite state machine used to
 * validate auto-complete suggestions.
 * <p>
 * A suggestion is split on single spaces, the words that belong to the query
 * are dropped, and the remaining tokens are fed to the automaton one by one.
 * With the transitions built in {@link #buildAutomaton()}, the remaining tokens
 * are accepted when they are either a single non-stop word, or one or more
 * stop words followed by a single non-stop word.
 * </p>
 * <p>
 * Methods:
 * </p>
 * <ul>
 * <li>{@link #SuggestionAutomaton()}: Constructor that builds the
 * automaton.</li>
 * <li>{@link #run(String, int, List)}: Runs the automaton on a given
 * suggestion, number of words from query, and stop words list.</li>
 * <li>{@link #getTokenType(List, String, boolean)}: Determines the token type
 * of a given token.</li>
 * </ul>
 *
 * @author Gabriel F. Gomazako
 * @since 0.3.9
 */
@Slf4j
public class SuggestionAutomaton {
    /** Entry state of the automaton; assigned once in the constructor. */
    private final State initialState;

    public SuggestionAutomaton() {
        this.initialState = buildAutomaton();
    }

    /**
     * Constructs the finite state machine structure.
     *
     * @return the initial state (N0) of the automaton.
     */
    private static State buildAutomaton() {
        State n0 = new State("N0");
        State n1 = new State("N1");
        State n2 = new State("N2");
        State n3 = new State("N3");
        State accept = new State("Accept", State.StateType.ACCEPT);
        State reject = new State("Error", State.StateType.REJECT);

        // N0: nothing consumed yet.
        n0.transitions.put(TokenType.WORD, n1);
        n0.transitions.put(TokenType.STOP_WORD, reject);
        n0.transitions.put(TokenType.EMPTY, reject);
        n0.transitions.put(TokenType.SPECIAL_STOP_WORD, n2);

        // N1: exactly one non-stop word consumed.
        n1.transitions.put(TokenType.WORD, reject);
        n1.transitions.put(TokenType.EMPTY, accept);
        n1.transitions.put(TokenType.STOP_WORD, reject);
        n1.transitions.put(TokenType.SPECIAL_STOP_WORD, n2);

        // N2: inside a run of stop words; a non-stop word must follow.
        n2.transitions.put(TokenType.WORD, n3);
        n2.transitions.put(TokenType.STOP_WORD, n2);
        n2.transitions.put(TokenType.EMPTY, reject);
        n2.transitions.put(TokenType.SPECIAL_STOP_WORD, n2);

        // N3: stop word(s) followed by one non-stop word; only end of input is allowed.
        n3.transitions.put(TokenType.WORD, reject);
        n3.transitions.put(TokenType.STOP_WORD, reject);
        n3.transitions.put(TokenType.EMPTY, accept);
        n3.transitions.put(TokenType.SPECIAL_STOP_WORD, reject);

        return n0;
    }

    /**
     * Runs the suggestion automaton to determine if a given suggestion is valid.
     *
     * @param suggestion             the suggestion string to be evaluated.
     * @param numberOfWordsFromQuery the number of words from current query. It will be used to know
     *                               how many leading words of the suggestion belong to the query.
     * @param stopWords              a list of stop words.
     * @return {@code true} if the suggestion is valid according to the automaton rules,
     *         {@code false} otherwise (including when no token is left to evaluate).
     */
    public boolean run(String suggestion, int numberOfWordsFromQuery, List<String> stopWords) {
        // TOP -> [ "Hello", "World" ]
        Deque<String> tokensDeque = new ArrayDeque<>(List.of(suggestion.split(" ")));

        // Suggestions should not start with a stop word when is the first term of the query.
        if (numberOfWordsFromQuery == 1 && startsWithStopWord(tokensDeque, stopWords)) {
            return false;
        }

        // The suggestions will always include the query, so we need to ignore it.
        int wordsToRemove = numberOfWordsFromQuery - 1;
        // Query: "Hello my friend" -> numberOfWordsFromQuery = 3
        // Query: "Hello my friend " -> numberOfWordsFromQuery = 4
        while (wordsToRemove > 0 && !tokensDeque.isEmpty()) {
            tokensDeque.pop();
            wordsToRemove--;
        }

        // Must be checked before looking at the head of the deque: there is nothing to evaluate
        // when the suggestion is blank or has no words beyond the ones coming from the query.
        if (tokensDeque.isEmpty()) {
            log.warn("Suggestion is empty.");
            return false;
        }

        // Checks if now it starts with a stop word
        boolean firstTokenIsStopWord = startsWithStopWord(tokensDeque, stopWords);

        State currentState = this.initialState;
        log.debug("Testing suggestion: {}", suggestion);
        // Every NORMAL state has a transition for every token type, and an exhausted deque yields
        // EMPTY tokens, which always lead to ACCEPT or REJECT, so the loop terminates.
        while (currentState.stateType == State.StateType.NORMAL) {
            String currentToken = tokensDeque.poll();
            TokenType currentTokenType = getTokenType(stopWords, currentToken, firstTokenIsStopWord);

            log.debug("Current token: {} - Type: {}", currentToken, currentTokenType);
            log.debug("Current state: {}", currentState.name);

            currentState = currentState.getNextState(currentTokenType);
        }
        return currentState.stateType == State.StateType.ACCEPT;
    }

    /**
     * Tells whether the head of the deque is a stop word without ever passing {@code null} to
     * {@link List#contains(Object)}: null-hostile lists (e.g. {@code List.of(...)}) throw a
     * {@link NullPointerException} for a {@code null} argument.
     */
    private static boolean startsWithStopWord(Deque<String> tokens, List<String> stopWords) {
        String firstToken = tokens.peek();
        return firstToken != null && stopWords.contains(firstToken);
    }

    /**
     * Classifies a token.
     *
     * @param stopWords            a list of stop words.
     * @param currentToken         the token to classify; {@code null} (no tokens left) and the empty
     *                             string are both classified as {@link TokenType#EMPTY}.
     * @param firstTokenIsStopWord whether the first evaluated token of the suggestion is a stop word;
     *                             when {@code true}, stop words are reported as
     *                             {@link TokenType#SPECIAL_STOP_WORD}.
     * @return the type of the token.
     */
    @NotNull
    private TokenType getTokenType(List<String> stopWords, String currentToken, boolean firstTokenIsStopWord) {
        if (StringUtils.isEmpty(currentToken)) {
            return TokenType.EMPTY;
        }
        if (!stopWords.contains(currentToken)) {
            return TokenType.WORD;
        }
        return firstTokenIsStopWord ? TokenType.SPECIAL_STOP_WORD : TokenType.STOP_WORD;
    }

    private enum TokenType {
        WORD,
        STOP_WORD,
        SPECIAL_STOP_WORD,
        EMPTY
    }

    /**
     * A node of the automaton. Declared static: it never uses the enclosing instance, and a nested
     * enum inside a non-static inner class does not compile before Java 16.
     */
    private static final class State {
        private final Map<TokenType, State> transitions = new EnumMap<>(TokenType.class);
        private final StateType stateType;
        private final String name;

        private State(String name) {
            this(name, StateType.NORMAL);
        }

        private State(String name, StateType stateType) {
            this.stateType = stateType;
            this.name = name;
        }

        /** Returns the state reached by consuming the given token type, or {@code null} if none is registered. */
        private State getNextState(TokenType tokenType) {
            return this.transitions.get(tokenType);
        }

        private enum StateType {
            ACCEPT,
            REJECT,
            NORMAL
        }
    }

}
171 changes: 171 additions & 0 deletions turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
package com.viglet.turing.sn.ac;

import lombok.extern.slf4j.Slf4j;

import java.util.*;

/**
 * The {@code SuggestionFilter} class is responsible for filtering a list of
 * suggestions based on a specified strategy and a list of stop words. It
 * currently supports two strategies: {@code DEFAULT} and {@code AUTOMATON}.
 * <p>
 * <ul>
 * <li>The {@code DEFAULT} strategy follows the original filter implemented in
 * "TurSNAutoComplete".</li>
 * <li>The {@code AUTOMATON} strategy uses a finite state machine to filter
 * suggestions. This strategy filters suggestions for a single word; if the
 * query has a space at the end, it will consider the query as a two-word query
 * and will suggest a next word for the query. It will consider a stop word
 * followed by a non-stop word as a valid suggestion.</li>
 * </ul>
 * <p>
 * A strategy must be selected through {@link #defaultStrategyConfig(int)},
 * {@link #defaultStrategyConfig(int, boolean)},
 * {@link #automatonStrategyConfig(int)} or
 * {@link #setStrategy(SuggestionFilterStrategy)} before calling
 * {@link #filter(List)}; otherwise {@code filter} returns an empty list.
 *
 * @author Gabriel F. Gomazako
 * @since 0.3.9
 */
@Slf4j
public class SuggestionFilter {

    private static final String SPACE_CHAR = " ";
    private final List<String> stopWords;
    private int numberOfWordsFromQuery = 0;
    private SuggestionFilterStrategy strategy;
    private boolean useTermsQueryEqualsAutoComplete = true;

    public SuggestionFilter(List<String> stopWords) {
        this.stopWords = stopWords;
    }

    /**
     * Configures the suggestion filter to use the default strategy - Legacy
     * strategy.
     *
     * @param numberOfWordsFromQuery the number of words from the query to be
     *                               considered in the suggestion filter.
     */
    public void defaultStrategyConfig(int numberOfWordsFromQuery) {
        this.strategy = SuggestionFilterStrategy.DEFAULT;
        this.numberOfWordsFromQuery = numberOfWordsFromQuery;
    }

    /**
     * Configures the default strategy for suggestions - Legacy strategy.
     *
     * @param numberOfWordsFromQuery          the number of words to consider from
     *                                        the query.
     * @param useTermsQueryEqualsAutoComplete flag indicating whether a suggestion
     *                                        must have the same number of words as
     *                                        the query to be kept.
     */
    public void defaultStrategyConfig(int numberOfWordsFromQuery, boolean useTermsQueryEqualsAutoComplete) {
        this.defaultStrategyConfig(numberOfWordsFromQuery);
        this.useTermsQueryEqualsAutoComplete = useTermsQueryEqualsAutoComplete;
    }

    /**
     * Configures the suggestion filter to use the automaton strategy - New
     * strategy.
     *
     * @param numberOfWordsFromQuery the number of words from the query to be used
     *                               in the automaton strategy
     */
    public void automatonStrategyConfig(int numberOfWordsFromQuery) {
        this.strategy = SuggestionFilterStrategy.AUTOMATON;
        this.numberOfWordsFromQuery = numberOfWordsFromQuery;
    }

    /**
     * Filters a list of suggestions based on the defined strategy.
     *
     * @param suggestions the list of suggestions to be filtered
     * @return a list of filtered suggestions based on the strategy; an empty list
     *         when no strategy has been configured
     * @throws IllegalArgumentException if the suggestions list is null, or if the
     *                                  default strategy has to evaluate a
     *                                  suggestion while the stop words list is
     *                                  null or the number of words from the query
     *                                  is 0
     */
    public List<String> filter(List<String> suggestions) {
        if (suggestions == null) {
            throw new IllegalArgumentException("Suggestions list is null.");
        }
        // A switch on a null enum throws NullPointerException before reaching its
        // default branch, so the "not configured" case has to be handled up front.
        if (this.strategy == null) {
            log.warn("No strategy defined. Returning empty list.");
            return Collections.emptyList();
        }

        List<String> suggestionsFiltered = new ArrayList<>();
        switch (this.strategy) {
            case DEFAULT:
                for (String suggestion : suggestions) {
                    if (defaultStrategy(suggestion)) {
                        suggestionsFiltered.add(suggestion);
                    }
                }
                break;
            case AUTOMATON:
                SuggestionAutomaton automaton = new SuggestionAutomaton();
                for (String suggestion : suggestions) {
                    if (automaton.run(suggestion, numberOfWordsFromQuery, stopWords)) {
                        suggestionsFiltered.add(suggestion);
                    }
                }
                break;
            default:
                // Safety net for strategies added to the enum but not handled here.
                log.warn("No strategy defined. Returning empty list.");
                return Collections.emptyList();
        }

        return suggestionsFiltered;
    }

    public SuggestionFilterStrategy getStrategy() {
        return this.strategy;
    }

    public void setStrategy(SuggestionFilterStrategy strategy) {
        this.strategy = strategy;
    }

    /**
     * Legacy filter: decides whether a single suggestion is kept.
     *
     * @param suggestion the suggestion to evaluate
     * @return {@code true} if the suggestion has the expected number of words (or
     *         that check is disabled) and neither starts nor ends with a stop word
     * @throws IllegalArgumentException if the default strategy is not properly
     *                                  configured
     */
    private boolean defaultStrategy(String suggestion) {
        validateDefaultStrategyConfig();

        // Example: Query: "Hello" suggestion: "Hello World" suggestion.split =
        // ["Hello", "World"].length = 2
        String[] suggestionTokens = suggestion.split(SPACE_CHAR);
        int numberOfWordsFromAutoCompleteItem = suggestionTokens.length;

        // String.split drops trailing empty strings, so a suggestion made only of
        // spaces yields no tokens at all. There is no word to suggest, and the
        // first/last token lookups below would be out of bounds.
        if (numberOfWordsFromAutoCompleteItem == 0) {
            return false;
        }

        // Case: Autocompletes the current word being typed.
        // Example: Query: "Hel" suggestions = ["Hello", "Hello world", "help"] Filtered
        // suggestions = ["Hello", "help"]
        // if query ends with space, number of words from query will have an extra word.
        // so it will suggest the next word.
        boolean numberOfWordsIsEqual = (numberOfWordsFromQuery == numberOfWordsFromAutoCompleteItem);

        // Case: If the first word from the suggestion is a stop word, it will not be
        // added to the list.
        // Example: Query: "The_" suggestions = ["The World", "The office"] Filtered
        // suggestions = []
        boolean firstWordIsStopWord = stopWords.contains(suggestionTokens[0]);

        // Case: If the last word from the suggestion is a stop word, it will not be
        // added to the list.
        // Example: Query: "Hello_" suggestions = ["Hello my", "Hello world"] Filtered
        // suggestions = ["Hello world"]
        boolean lastWordIsStopWord = stopWords.contains(suggestionTokens[suggestionTokens.length - 1]);

        // Disable the use of terms query equals auto complete
        // Example: Query: "Hell" suggestions = ["Hello", "Hello world", "Help"]
        // Filtered suggestions = ["Hello", "Hello world", "Help"]
        numberOfWordsIsEqual = numberOfWordsIsEqual || !this.useTermsQueryEqualsAutoComplete;

        return (numberOfWordsIsEqual && !firstWordIsStopWord && !lastWordIsStopWord);
    }

    /**
     * Checks that the values required by the default strategy were provided.
     *
     * @throws IllegalArgumentException if the stop words list is null or the number
     *                                  of words from the query is 0
     */
    private void validateDefaultStrategyConfig() {
        if (this.stopWords == null) {
            throw new IllegalArgumentException("Stop words list is not defined.");
        }
        if (this.numberOfWordsFromQuery == 0) {
            throw new IllegalArgumentException("Number of words from query is not defined.");
        }
    }

    /**
     * Available filtering strategies. Public because it is part of the signature
     * of the public {@link #getStrategy()} and {@link #setStrategy} methods.
     */
    public enum SuggestionFilterStrategy {
        DEFAULT,
        AUTOMATON
    }
}
Loading

0 comments on commit 3413263

Please sign in to comment.