diff --git a/turing-app/src/main/java/com/viglet/turing/se/TurSEStopWord.java b/turing-app/src/main/java/com/viglet/turing/se/TurSEStopWord.java index aa58486d88..fc55fea509 100644 --- a/turing-app/src/main/java/com/viglet/turing/se/TurSEStopWord.java +++ b/turing-app/src/main/java/com/viglet/turing/se/TurSEStopWord.java @@ -57,7 +57,7 @@ public TurSEStopWord(ResourceLoader resourceloader) { private static final String STOP_WORD_CLASS_FILTER = "solr.StopFilterFactory"; private static final String WORDS_ATTRIBUTE = "words"; private static final String DEFAULT_STOP_WORD_FILE = "classpath:/solr/conf/lang/stopwords.txt"; - private static final String APPLICATION_OCTET_STREAM_UTF8 = "application/octet-stream;charset:utf-8"; + private static final String APPLICATION_OCTET_STREAM_UTF8 = "application/octet-stream;charset=utf-8"; private static final String ADMIN_FILE_URL = "%s/admin/file?contentType=%s&file=%s"; public List getStopWords(TurSolrInstance turSolrInstance) { diff --git a/turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionAutomaton.java b/turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionAutomaton.java new file mode 100644 index 0000000000..54a3ae924d --- /dev/null +++ b/turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionAutomaton.java @@ -0,0 +1,175 @@ +package com.viglet.turing.sn.ac; + +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.jetbrains.annotations.NotNull; + +import java.util.*; + +/** + * The SuggestionAutomaton class represents a finite state machine used to + * validate suggestions. + *

+ * Methods: + *

+ * {@code run(suggestion, numberOfWordsFromQuery, stopWords)} tokenizes a
+ * suggestion, skips the words already typed in the query and feeds the
+ * remaining tokens to the state machine.
+ *
+ * @author Gabriel F. Gomazako
+ * @since 0.3.9
+ */
+@Slf4j
+public class SuggestionAutomaton {
+    /** Entry state; the transition graph is not modified after construction. */
+    private final State initialState;
+
+    public SuggestionAutomaton() {
+        this.initialState = buildAutomaton();
+    }
+
+    /**
+     * Builds the finite state machine and returns its entry state.
+     * <p>
+     * N0 is the start, N1 means "one word read", N2 means "inside a run of stop
+     * words" and N3 means "word read after the stop words". The machine accepts
+     * when the tokens end (EMPTY) in N1 or N3; an EMPTY token in N0 or N2 is
+     * rejected.
+     *
+     * @return the entry state (N0).
+     */
+    private static State buildAutomaton() {
+        State n0 = new State("N0");
+        State n1 = new State("N1");
+        State n2 = new State("N2");
+        State n3 = new State("N3");
+        State accept = new State("Accept", State.StateType.ACCEPT);
+        State reject = new State("Error", State.StateType.REJECT);
+
+        n0.transitions.put(TokenType.WORD, n1);
+        n0.transitions.put(TokenType.STOP_WORD, reject);
+        n0.transitions.put(TokenType.EMPTY, reject);
+        n0.transitions.put(TokenType.SPECIAL_STOP_WORD, n2);
+
+        n1.transitions.put(TokenType.WORD, reject);
+        n1.transitions.put(TokenType.EMPTY, accept);
+        n1.transitions.put(TokenType.STOP_WORD, reject);
+        n1.transitions.put(TokenType.SPECIAL_STOP_WORD, n2);
+
+        n2.transitions.put(TokenType.WORD, n3);
+        n2.transitions.put(TokenType.STOP_WORD, n2);
+        n2.transitions.put(TokenType.EMPTY, reject);
+        n2.transitions.put(TokenType.SPECIAL_STOP_WORD, n2);
+
+        n3.transitions.put(TokenType.WORD, reject);
+        n3.transitions.put(TokenType.STOP_WORD, reject);
+        n3.transitions.put(TokenType.EMPTY, accept);
+        n3.transitions.put(TokenType.SPECIAL_STOP_WORD, reject);
+
+        return n0;
+    }
+
+    /**
+     * Runs the suggestion automaton to determine if a given suggestion is valid.
+     *
+     * @param suggestion             the suggestion string to be evaluated; {@code null} is rejected.
+     * @param numberOfWordsFromQuery the number of words from current query. It will be used to know how many
+     *                               leading words of the suggestion are skipped before validation.
+     * @param stopWords              a list of stop words; must not be {@code null}.
+     * @return {@code true} if the suggestion is valid according to the automaton rules, {@code false} otherwise.
+     * @throws NullPointerException if {@code stopWords} is {@code null}.
+     */
+    public boolean run(String suggestion, int numberOfWordsFromQuery, List<String> stopWords) {
+        Objects.requireNonNull(stopWords, "stopWords");
+        if (suggestion == null) {
+            return false;
+        }
+
+        // TOP -> [ "Hello", "World" ]
+        Deque<String> tokensDeque = new ArrayDeque<>(List.of(suggestion.split(" ")));
+
+        // Suggestions should not start with a stop word when it is the first term of the query.
+        if (numberOfWordsFromQuery == 1 && isStopWord(stopWords, tokensDeque.peek())) {
+            return false;
+        }
+
+        // The suggestions will always include the query, so we need to ignore it.
+        // Query: "Hello my friend"  -> numberOfWordsFromQuery = 3
+        // Query: "Hello my friend " -> numberOfWordsFromQuery = 4
+        for (int wordsToRemove = numberOfWordsFromQuery - 1;
+             wordsToRemove > 0 && !tokensDeque.isEmpty(); wordsToRemove--) {
+            tokensDeque.pop();
+        }
+
+        // Check for emptiness before peeking: peek() returns null on an empty deque.
+        if (tokensDeque.isEmpty()) {
+            log.warn("Suggestion is empty.");
+            return false;
+        }
+
+        // Checks if now it starts with a stop word
+        boolean firstTokenIsStopWord = isStopWord(stopWords, tokensDeque.peek());
+
+        State currentState = this.initialState;
+        // Debug level: this runs for every suggestion of every auto-complete request.
+        log.debug("Testing suggestion: {}", suggestion);
+        while (currentState.stateType == State.StateType.NORMAL) {
+            // poll() yields null once the tokens are exhausted, which maps to EMPTY.
+            String currentToken = tokensDeque.poll();
+            TokenType currentTokenType = getTokenType(stopWords, currentToken, firstTokenIsStopWord);
+
+            log.debug("Current token: {} - Type: {}", currentToken, currentTokenType);
+            log.debug("Current state: {}", currentState.name);
+
+            currentState = currentState.getNextState(currentTokenType);
+        }
+        return currentState.stateType == State.StateType.ACCEPT;
+    }
+
+    /**
+     * Null-safe stop word lookup; immutable lists such as {@code List.of(...)}
+     * throw on {@code contains(null)}.
+     */
+    private static boolean isStopWord(List<String> stopWords, String token) {
+        return token != null && stopWords.contains(token);
+    }
+
+    /**
+     * Classifies a token. A {@code null} or empty token is EMPTY; a stop word is
+     * SPECIAL_STOP_WORD when the remaining suggestion started with a stop word and
+     * STOP_WORD otherwise; anything else is WORD.
+     */
+    @NotNull
+    private TokenType getTokenType(List<String> stopWords, String currentToken, boolean firstTokenIsStopWord) {
+        if (StringUtils.isEmpty(currentToken)) {
+            return TokenType.EMPTY;
+        }
+        if (stopWords.contains(currentToken)) {
+            return firstTokenIsStopWord ? TokenType.SPECIAL_STOP_WORD : TokenType.STOP_WORD;
+        }
+        return TokenType.WORD;
+    }
+
+    private enum TokenType {
+        WORD,
+        STOP_WORD,
+        SPECIAL_STOP_WORD,
+        EMPTY
+    }
+
+    /** A node of the automaton; static because it never reads the enclosing instance. */
+    private static final class State {
+        private final Map<TokenType, State> transitions = new EnumMap<>(TokenType.class);
+        private final StateType stateType;
+        private final String name;
+
+        private State(String name) {
+            this(name, StateType.NORMAL);
+        }
+
+        private State(String name, StateType stateType) {
+            this.stateType = stateType;
+            this.name = name;
+        }
+
+        private State getNextState(TokenType tokenType) {
+            return this.transitions.get(tokenType);
+        }
+
+        private enum StateType {
+            ACCEPT,
+            REJECT,
+            NORMAL
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionFilter.java b/turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionFilter.java
new file mode 100644
index 0000000000..0afbe1e068
--- /dev/null
+++ b/turing-app/src/main/java/com/viglet/turing/sn/ac/SuggestionFilter.java
@@ -0,0 +1,171 @@
+package com.viglet.turing.sn.ac;
+
+import lombok.extern.slf4j.Slf4j;
+
+import java.util.*;
+
+/**
+ * The {@code SuggestionFilter} class is responsible for filtering a list of
+ * suggestions
+ * based on a specified strategy and a list of stop words. It currently supports
+ * two
+ * strategies: {@code DEFAULT} and {@code AUTOMATON}.
+ *

+ *

    + *
  • The {@code DEFAULT} strategy follows the original filter implemented in + * "TurSNAutoComplete".
  • + *
  • The {@code AUTOMATON} strategy uses a finite state machine to filter + * suggestions. This strategy suggests + * a single word; if the query has a space at the end, it will + * consider the query as a two-word query and will suggest + * a next word for the query. It will consider a stop word followed by a + * non-stop word as a valid suggestion.
  • + *
+ *
+ * @author Gabriel F. Gomazako
+ * @since 0.3.9
+ */
+@Slf4j
+public class SuggestionFilter {
+
+    private static final String SPACE_CHAR = " ";
+    private final List<String> stopWords;
+    private int numberOfWordsFromQuery = 0;
+    private SuggestionFilterStrategy strategy;
+    private boolean useTermsQueryEqualsAutoComplete = true;
+
+    public SuggestionFilter(List<String> stopWords) {
+        this.stopWords = stopWords;
+    }
+
+    /**
+     * Configures the suggestion filter to use the default strategy - Legacy
+     * strategy.
+     *
+     * @param numberOfWordsFromQuery the number of words from the query to be
+     *                               considered in the suggestion filter.
+     */
+    public void defaultStrategyConfig(int numberOfWordsFromQuery) {
+        this.strategy = SuggestionFilterStrategy.DEFAULT;
+        this.numberOfWordsFromQuery = numberOfWordsFromQuery;
+    }
+
+    /**
+     * Configures the default strategy for suggestions - Legacy strategy.
+     *
+     * @param numberOfWordsFromQuery          the number of words to consider from
+     *                                        the query.
+     * @param useTermsQueryEqualsAutoComplete flag indicating whether to use terms
+     *                                        query equals auto-complete.
+     */
+    public void defaultStrategyConfig(int numberOfWordsFromQuery, boolean useTermsQueryEqualsAutoComplete) {
+        this.defaultStrategyConfig(numberOfWordsFromQuery);
+        this.useTermsQueryEqualsAutoComplete = useTermsQueryEqualsAutoComplete;
+    }
+
+    /**
+     * Configures the suggestion filter to use the automaton strategy - New
+     * strategy.
+     *
+     * @param numberOfWordsFromQuery the number of words from the query to be used
+     *                               in the automaton strategy
+     */
+    public void automatonStrategyConfig(int numberOfWordsFromQuery) {
+        this.strategy = SuggestionFilterStrategy.AUTOMATON;
+        this.numberOfWordsFromQuery = numberOfWordsFromQuery;
+    }
+
+    /**
+     * Filters a list of suggestions based on the defined strategy.
+     *
+     * @param suggestions the list of suggestions to be filtered
+     * @return a list of filtered suggestions based on the strategy; an empty list
+     *         when no strategy was configured
+     * @throws IllegalArgumentException if the suggestions list is null
+     */
+    public List<String> filter(List<String> suggestions) {
+        if (suggestions == null) {
+            throw new IllegalArgumentException("Suggestions list is null.");
+        }
+        // A switch on a null enum throws NullPointerException, so the unconfigured
+        // case has to be handled before the switch.
+        if (this.strategy == null) {
+            log.warn("No strategy defined. Returning empty list.");
+            return Collections.emptyList();
+        }
+
+        List<String> suggestionsFiltered = new ArrayList<>();
+        switch (this.strategy) {
+            case DEFAULT:
+                for (String suggestion : suggestions) {
+                    if (defaultStrategy(suggestion)) {
+                        suggestionsFiltered.add(suggestion);
+                    }
+                }
+                break;
+            case AUTOMATON:
+                SuggestionAutomaton automaton = new SuggestionAutomaton();
+                for (String suggestion : suggestions) {
+                    if (automaton.run(suggestion, numberOfWordsFromQuery, stopWords)) {
+                        suggestionsFiltered.add(suggestion);
+                    }
+                }
+                break;
+            default:
+                log.warn("No strategy defined. Returning empty list.");
+                return Collections.emptyList();
+        }
+
+        return suggestionsFiltered;
+    }
+
+    public SuggestionFilterStrategy getStrategy() {
+        return this.strategy;
+    }
+
+    public void setStrategy(SuggestionFilterStrategy strategy) {
+        this.strategy = strategy;
+    }
+
+    /**
+     * Legacy filter: keeps a suggestion when it has as many words as the query
+     * (unless that check is disabled) and neither starts nor ends with a stop word.
+     *
+     * @throws IllegalArgumentException if the stop words or the number of query
+     *                                  words were not configured
+     */
+    private boolean defaultStrategy(String suggestion) {
+        validateDefaultStrategyConfig();
+        if (suggestion == null) {
+            return false;
+        }
+
+        // Example: Query: "Hello" suggestion: "Hello World" suggestion.split =
+        // ["Hello", "World"].length = 2
+        String[] suggestionTokens = suggestion.split(SPACE_CHAR);
+        // A suggestion made only of spaces splits into an empty array; there is no
+        // first or last word to inspect, so it is never a valid suggestion.
+        if (suggestionTokens.length == 0) {
+            return false;
+        }
+        int numberOfWordsFromAutoCompleteItem = suggestionTokens.length;
+
+        // Case: Autocompletes the current word being typed.
+        // Example: Query: "Hel" suggestions = ["Hello", "Hello world", "help"] Filtered
+        // suggestions = ["Hello", "help"]
+        // if query ends with space, number of words from query will have an extra word.
+        // so it will suggest the next word.
+        boolean numberOfWordsIsEqual = (numberOfWordsFromQuery == numberOfWordsFromAutoCompleteItem);
+
+        // Case: If the first word from the suggestion is a stop word, it will not be
+        // added to the list.
+        // Example: Query: "The_" suggestions = ["The World", "The office"] Filtered
+        // suggestions = []
+        boolean firstWordIsStopWord = stopWords.contains(suggestionTokens[0]);
+
+        // Case: If the last word from the suggestion is a stop word, it will not be
+        // added to the list.
+        // Example: Query: "Hello_" suggestions = ["Hello my", "Hello world"] Filtered
+        // suggestions = ["Hello world"]
+        boolean lastWordIsStopWord = stopWords.contains(suggestionTokens[suggestionTokens.length - 1]);
+
+        // Disable the use of terms query equals auto complete
+        // Example: Query: "Hell" suggestions = ["Hello", "Hello world", "Help"]
+        // Filtered suggestions = ["Hello", "Hello world", "Help"]
+        numberOfWordsIsEqual = numberOfWordsIsEqual || !this.useTermsQueryEqualsAutoComplete;
+
+        return (numberOfWordsIsEqual && !firstWordIsStopWord && !lastWordIsStopWord);
+    }
+
+    private void validateDefaultStrategyConfig() {
+        if (this.stopWords == null) {
+            throw new IllegalArgumentException("Stop words list is not defined.");
+        }
+        if (this.numberOfWordsFromQuery == 0) {
+            throw new IllegalArgumentException("Number of words from query is not defined.");
+        }
+    }
+
+    /** Public because it appears in the signatures of the public strategy accessors. */
+    public enum SuggestionFilterStrategy {
+        DEFAULT,
+        AUTOMATON
+    }
+}
\ No newline at end of file
diff --git a/turing-app/src/main/java/com/viglet/turing/sn/ac/TurSNAutoComplete.java b/turing-app/src/main/java/com/viglet/turing/sn/ac/TurSNAutoComplete.java
index e46c0e14ce..c4b2de53b9 100644
--- a/turing-app/src/main/java/com/viglet/turing/sn/ac/TurSNAutoComplete.java
+++ b/turing-app/src/main/java/com/viglet/turing/sn/ac/TurSNAutoComplete.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016-2022 the original author or authors.
+ * Copyright (C) 2016-2022 the original author or authors.
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.
See the NOTICE file
@@ -33,114 +33,94 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
-import java.util.stream.Collectors;
+
 @Slf4j
 @Component
 public class TurSNAutoComplete {
-    private static final String SPACE_CHAR = " ";
-    private static final boolean USE_BIGGER_TERMS = false;
-    private static final boolean USE_TERMS_QUERY_EQUALS_AUTO_COMPLETE = true;
-    private static final boolean USE_REPEAT_QUERY_TEXT_ON_AUTOCOMPLETE = false;
-    private final TurSolr turSolr;
-    private final TurSEStopWord turSEStopword;
-    private final TurSolrInstanceProcess turSolrInstanceProcess;
-
-    @Inject
-    public TurSNAutoComplete(TurSolr turSolr, TurSEStopWord turSEStopword, TurSolrInstanceProcess turSolrInstanceProcess) {
-        this.turSolr = turSolr;
-        this.turSEStopword = turSEStopword;
-        this.turSolrInstanceProcess = turSolrInstanceProcess;
-    }
-
-    public List autoComplete(String siteName, String q, Locale locale, long rows) {
-
-        if (q.length() > 1) {
-            return turSolrInstanceProcess.initSolrInstance(siteName, locale).map(instance -> {
-                SpellCheckResponse turSEResults = executeAutoCompleteFromSE(instance, q);
-                int numberOfWordsFromQuery = q.split(SPACE_CHAR).length;
-                if (q.endsWith(SPACE_CHAR)) {
-                    numberOfWordsFromQuery++;
-                }
-                List autoCompleteListFormatted = createFormattedList(turSEResults, instance,
-                        numberOfWordsFromQuery);
-                List autoCompleteListShrink = removeDuplicatedTerms(autoCompleteListFormatted,
-                        numberOfWordsFromQuery, q);
-                return autoCompleteListShrink.stream().limit(rows).toList();
-            }).orElse(Collections.emptyList());
-        } else {
-            return Collections.emptyList();
-        }
-    }
-
-
-    private SpellCheckResponse executeAutoCompleteFromSE(TurSolrInstance turSolrInstance, String q) {
-        SpellCheckResponse turSEResults = null;
-        try {
-            turSEResults = turSolr.autoComplete(turSolrInstance, q);
-        } catch (Exception e) {
-            log.error(e.getMessage(), e);
-        }
-        return turSEResults;
-    }
+    private static final String SPACE_CHAR = " ";
+    private final TurSolr turSolr;
+    private final TurSEStopWord turSEStopword;
+    private final TurSolrInstanceProcess turSolrInstanceProcess;
-    private List createFormattedList(SpellCheckResponse turSEResults, TurSolrInstance turSolrInstance,
-                                     int numberOfWordsFromQuery) {
-        List autoCompleteListFormatted = new ArrayList<>();
-        if (hasSuggestions(turSEResults)) {
-            List autoCompleteList = turSEResults.getSuggestions().getFirst().getAlternatives();
-            TurSNAutoCompleteListData autoCompleteListData = new TurSNAutoCompleteListData(turSEStopword.getStopWords(turSolrInstance));
-            autoCompleteList.forEach(autoCompleteItem -> processTerm(numberOfWordsFromQuery, autoCompleteListFormatted,
-                    autoCompleteListData, autoCompleteItem));
-        }
-        return autoCompleteListFormatted;
-    }
+    @Inject
+    public TurSNAutoComplete(TurSolr turSolr, TurSEStopWord turSEStopword, TurSolrInstanceProcess turSolrInstanceProcess) {
+        this.turSolr = turSolr;
+        this.turSEStopword = turSEStopword;
+        this.turSolrInstanceProcess = turSolrInstanceProcess;
+    }
-    private void processTerm(int numberOfWordsFromQuery, List autoCompleteListFormatted,
-                             TurSNAutoCompleteListData autoCompleteListData, String autoCompleteItem) {
-        if (isAddTermToList(autoCompleteListData, numberOfWordsFromQuery, autoCompleteItem)) {
-            autoCompleteListFormatted.add(autoCompleteItem);
-        }
-    }
+    /**
+     * Returns auto-complete suggestions for the text typed so far.
+     *
+     * @param siteName site name passed to the Solr instance lookup.
+     * @param q        text typed so far; {@code null} or fewer than two characters
+     *                 yields an empty list.
+     * @param locale   locale passed to the Solr instance lookup.
+     * @param rows     maximum number of suggestions to return.
+     * @return the filtered, de-duplicated suggestions, or an empty list when none apply.
+     */
+    public List<String> autoComplete(String siteName, String q, Locale locale, long rows) {
+        // Only autocomplete if the query has more than one character
+        if (q != null && q.length() > 1) {
+            // Initialize Solr Instance
+            return turSolrInstanceProcess.initSolrInstance(siteName, locale).map(instance -> {
+                // Execute AutoComplete Solr API
+                SpellCheckResponse turSEResults = executeAutoCompleteFromSE(instance, q);
+                int numberOfWordsFromQuery = q.split(SPACE_CHAR).length;
+                // split() drops trailing empty tokens, so a trailing space is counted by hand. It could also be
+                // inferred by splitting while keeping the delimiters and checking whether the array length is even.
+                if (q.endsWith(SPACE_CHAR)) {
+                    numberOfWordsFromQuery++;
+                }
-    private boolean isAddTermToList(TurSNAutoCompleteListData autoCompleteListData, int numberOfWordsFromQuery,
-                                    String autoCompleteItem) {
-        String[] autoCompleteItemTokens = autoCompleteItem.split(SPACE_CHAR);
-        int numberOfWordsFromAutoCompleteItem = autoCompleteItemTokens.length;
-        String autoCompleteItemFirstToken = autoCompleteItemTokens[0];
-        String autoCompleteItemLastToken = autoCompleteItemTokens[autoCompleteItemTokens.length - 1];
-        boolean numberOfWordsIsEquals = numberOfWordsFromQuery == numberOfWordsFromAutoCompleteItem;
-        boolean firstWordIsStopword = autoCompleteListData.getStopWords().contains(autoCompleteItemFirstToken);
-        boolean lastWordIsStopword = autoCompleteListData.getStopWords().contains(autoCompleteItemLastToken);
+                List<String> autoCompleteListFormatted = createFormattedList(turSEResults, instance,
+                        numberOfWordsFromQuery);
+                // The legacy pipeline removed duplicated terms; keep doing so before limiting.
+                return autoCompleteListFormatted.stream().distinct().limit(rows).toList();
+            }).orElse(Collections.emptyList());
+        } else {
+            return Collections.emptyList();
+        }
+    }
-        return (!USE_TERMS_QUERY_EQUALS_AUTO_COMPLETE || numberOfWordsIsEquals) && !firstWordIsStopword && !lastWordIsStopword;
-    }
-    private boolean hasSuggestions(SpellCheckResponse turSEResults) {
-        return turSEResults != null && turSEResults.getSuggestions() != null
-                && !turSEResults.getSuggestions().isEmpty();
-    }
+    /**
+     * Calls the Solr auto-complete query; failures are logged and reported as {@code null}.
+     */
+    private SpellCheckResponse executeAutoCompleteFromSE(TurSolrInstance turSolrInstance, String q) {
+        SpellCheckResponse turSEResults = null;
+        try {
+            turSEResults = turSolr.autoComplete(turSolrInstance, q);
+        } catch (Exception e) {
+            log.error(e.getMessage(), e);
+        }
+        return turSEResults;
+    }
-    @SuppressWarnings("unused")
-    private List removeDuplicatedTerms(List autoCompleteList, int numberOfWordsFromQuery, String termQuery) {
-        List autoCompleteWithoutDuplicated = autoCompleteList.stream().distinct().collect(Collectors.toList());
-        if (USE_REPEAT_QUERY_TEXT_ON_AUTOCOMPLETE && autoCompleteWithoutDuplicated.isEmpty()) {
-            autoCompleteWithoutDuplicated.add(termQuery);
-        }
-        return USE_BIGGER_TERMS ? biggerTerms(autoCompleteWithoutDuplicated) : autoCompleteWithoutDuplicated;
-    }
+    /**
+     * Takes the alternatives of the first Solr suggestion and filters them with the automaton strategy.
+     *
+     * @return the accepted suggestions, or an empty list when the response has no suggestions.
+     */
+    private List<String> createFormattedList(SpellCheckResponse turSEResults, TurSolrInstance turSolrInstance,
+                                             int numberOfWordsFromQuery) {
+        List<String> autoCompleteListFormatted = new ArrayList<>();
+        // if there are suggestions in the response.
+        if (hasSuggestions(turSEResults)) {
+            // autoCompleteList is the list of auto complete suggestions returned by Solr.
+            List<String> autoCompleteList = turSEResults.getSuggestions().getFirst().getAlternatives();
+            TurSNAutoCompleteListData autoCompleteListData = new TurSNAutoCompleteListData(turSEStopword.getStopWords(turSolrInstance));
-    private List biggerTerms(List autoCompleteWithoutDuplicated) {
-        List autoCompleteOnlyBiggerTerms = new ArrayList<>();
-        for (String term : autoCompleteWithoutDuplicated) {
-            List resultList = autoCompleteWithoutDuplicated.stream().filter(s -> s.startsWith(term))
-                    .toList();
-            if (resultList.size() == 1) {
-                autoCompleteOnlyBiggerTerms.add(resultList.getFirst());
-            }
-        }
+            SuggestionFilter suggestionFilter = new SuggestionFilter(autoCompleteListData.getStopWords());
+            suggestionFilter.automatonStrategyConfig(numberOfWordsFromQuery);
+            autoCompleteListFormatted = suggestionFilter.filter(autoCompleteList);
+        }
+        return autoCompleteListFormatted;
+    }
-        return autoCompleteOnlyBiggerTerms;
-    }
+    private boolean hasSuggestions(SpellCheckResponse turSEResults) {
+        return turSEResults != null && turSEResults.getSuggestions() != null
+                && !turSEResults.getSuggestions().isEmpty();
+    }
-
 }
diff --git a/turing-app/src/main/java/com/viglet/turing/solr/TurSolr.java b/turing-app/src/main/java/com/viglet/turing/solr/TurSolr.java
index 99ca9fa2e7..eed2c021d5 100644
--- a/turing-app/src/main/java/com/viglet/turing/solr/TurSolr.java
+++ b/turing-app/src/main/java/com/viglet/turing/solr/TurSolr.java
@@ -284,6 +284,28 @@ public
SolrDocumentList solrResultAnd(TurSolrInstance turSolrInstance, Map + * "spellcheck": { + * "suggestions": [ + * "batata", { + * "numFound": 49, + * "startOffset": 0, + * "endOffset": 8, + * "suggestion": [ + * "batata doce", + * "batata baroa", + * "batata baroa palitos", + * "batata baroa palitos de", + * ... + * ] + * } + * ] + * } + * + */ public SpellCheckResponse autoComplete(TurSolrInstance turSolrInstance, String term) { return executeSolrQuery(turSolrInstance, new SolrQuery().setRequestHandler(TUR_SUGGEST).setQuery(term)) .map(QueryResponse::getSpellCheckResponse).orElse(null);