Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

funders: tune search boost for acronyms #404

Merged
merged 1 commit into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions invenio_vocabularies/contrib/funders/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""Vocabulary funders configuration."""

from flask import current_app
from invenio_i18n import get_locale
from invenio_i18n import lazy_gettext as _
from invenio_records_resources.services import SearchOptions
from invenio_records_resources.services.records.components import DataComponent
Expand All @@ -22,6 +23,7 @@
# DOI prefix for funders, read from the ``VOCABULARIES_FUNDER_DOI_PREFIX``
# entry of ``current_app.config``. The lookup lives in a lambda handed to
# LocalProxy, so it is presumably deferred until the proxy is used
# (NOTE(review): the LocalProxy import is outside this view -- confirm it
# is werkzeug's).
funder_fundref_doi_prefix = LocalProxy(
lambda: current_app.config["VOCABULARIES_FUNDER_DOI_PREFIX"]
)
# Search field spec for the title in the current locale with a ``^20``
# boost, e.g. "title.en^20" when ``get_locale()`` yields "en". Built inside
# the lambda so the locale is taken from ``get_locale()`` when the lambda
# runs, not hard-coded.
localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")


class FundersSearchOptions(SearchOptions):
Expand All @@ -30,9 +32,12 @@ class FundersSearchOptions(SearchOptions):
suggest_parser_cls = SuggestQueryParser.factory(
fields=[
"name^100",
"acronym.keyword^100",
"acronym^40",
localized_title,
"id^20",
"aliases^20",
"identifiers.identifier^10",
"acronym^10",
"aliases^10",
],
type="most_fields", # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
fuzziness="AUTO", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@
]
}
},
"normalizer": {
"accent_normalizer": {
"type": "custom",
"char_filter": ["strip_special_chars"],
"filter": [
"lowercase",
"asciifolding"
]
}
},
"filter": {
"lowercase": {
"type": "lowercase",
Expand Down Expand Up @@ -112,7 +122,13 @@
"acronym": {
"type": "text",
"analyzer": "accent_edge_analyzer",
"search_analyzer": "accent_analyzer"
"search_analyzer": "accent_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"normalizer": "accent_normalizer"
}
}
},
"status": {
"type": "keyword"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@
]
}
},
"normalizer": {
"accent_normalizer": {
"type": "custom",
"char_filter": ["strip_special_chars"],
"filter": [
"lowercase",
"asciifolding"
]
}
},
"filter": {
"lowercase": {
"type": "lowercase",
Expand Down Expand Up @@ -112,7 +122,13 @@
"acronym": {
"type": "text",
"analyzer": "accent_edge_analyzer",
"search_analyzer": "accent_analyzer"
"search_analyzer": "accent_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"normalizer": "accent_normalizer"
}
}
},
"status": {
"type": "keyword"
Expand Down
5 changes: 4 additions & 1 deletion tests/contrib/funders/test_funders_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,12 @@ def test_funders_suggest_sort(client, h, prefix, example_funders):
# Should show 3 results, and id=cern as first due to name
res = client.get(f"{prefix}?suggest=CERN", headers=h)
assert res.status_code == 200
assert res.json["hits"]["total"] == 2 # should be 2
assert res.json["hits"]["total"] == 3
assert res.json["hits"]["hits"][0]["name"] == "CERN"
assert res.json["hits"]["hits"][1]["name"] == "CERT"
# Matches lower, since title is boosted less
assert res.json["hits"]["hits"][2]["name"] == "OTHER"
assert res.json["hits"]["hits"][2]["title"]["en"] == "CERN"

res = client.get(f"{prefix}?suggest=N%C3%B5rthw%C3%AAst", headers=h) # Nõrthwêst
assert res.status_code == 200
Expand Down