Skip to content

Commit

Permalink
create filter_text_terms function
Browse files Browse the repository at this point in the history
  • Loading branch information
edulauer committed Apr 19, 2024
1 parent a91caee commit 8f35bc8
Showing 1 changed file with 24 additions and 5 deletions.
29 changes: 24 additions & 5 deletions src/hooks/inlabs_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,27 @@ class TextDictHandler:
def __init__(self, *args, **kwargs):
pass

@staticmethod
def filter_text_terms(text_terms) -> list:
"""
Filter the text terms by removing words that succeed the delimitator ! and
split words based on search term operators
Args:
text_terms (list): The list of text terms used in the search.
Returns:
list: A list of filtered text terms
"""
search_term_operators = ['&', '!', '|']

text_terms = [re.sub(r"! .*", "", term).strip() for term in text_terms]
operator_str = ''.join(search_term_operators)
split_text_terms = [re.split(rf'\s*[{re.escape(operator_str)}]\s*', term) for term in text_terms]
text_terms = [item for sublist in split_text_terms for item in sublist]

return text_terms

def transform_search_results(
self,
response: pd.DataFrame,
Expand All @@ -237,10 +258,8 @@ def transform_search_results(
Returns:
dict: A dictionary of sorted and processed search results.
"""
term_operators = ['&', '!']
operator_str = ''.join(term_operators)
split_text_terms = [re.split(rf'\s*[{re.escape(operator_str)}]\s*', term) for term in text_terms]
text_terms = [item for sublist in split_text_terms for item in sublist]
# Drop the words that follow the "!" delimiter and split the terms on the search operators
filtered_text_terms = self.filter_text_terms(text_terms)

df = response.copy()
# `identifica` column is the publication title. If None
Expand All @@ -251,7 +270,7 @@ def transform_search_results(
df["identifica"] = df["identifica"].apply(self._remove_html_tags)
df["pubdate"] = df["pubdate"].dt.strftime("%d/%m/%Y")
df["texto"] = df["texto"].apply(self._remove_html_tags)
df["matches"] = df["texto"].apply(self._find_matches, keys=text_terms)
df["matches"] = df["texto"].apply(self._find_matches, keys=filtered_text_terms)
df["matches_assina"] = df.apply(
lambda row: self._normalize(row["matches"])
in self._normalize(row["assina"]),
Expand Down

0 comments on commit 8f35bc8

Please sign in to comment.