Skip to content

Commit

Permalink
Schema and default weight changes
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 committed Jul 30, 2024
1 parent 96b5820 commit 3909fba
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 59 deletions.
4 changes: 2 additions & 2 deletions backend/danswer/configs/chat_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@
else:
EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.62)))
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.5)))
# Weighting factor between Title and Content of documents during search, 1 for completely
# Title based. Default heavily favors Content because Title is also included at the top of
# Content. This is to avoid cases where the Content is very relevant but it may not be clear
# if the title is separated out. Title is more of a "boost" than a separate field.
TITLE_CONTENT_RATIO = max(
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.10))
)
# A list of languages passed to the LLM to rephrase the query
# For example "English,French,Spanish", be sure to use the "," separator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,10 @@ schema DANSWER_CHUNK_NAME {
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
indexing: summary | index | attribute
match {
gram
gram-size: 3
}
index: enable-bm25
}
field content type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
# duplication of `content` is far from ideal, but is needed for
Expand Down Expand Up @@ -153,43 +145,45 @@ schema DANSWER_CHUNK_NAME {
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
}

# This must be a separate function for normalize_linear to work
function vector_score() {
expression {
# If no title, the full vector score comes from the content embedding
(query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
((1 - query(title_content_ratio)) * closeness(field, embeddings))
}
}

# This must be a separate function for normalize_linear to work
function keyword_score() {
function title_vector_score() {
expression {
(query(title_content_ratio) * bm25(title)) +
((1 - query(title_content_ratio)) * bm25(content))
# If no good matching titles, then it should use the content embeddings rather than having some
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
# matching content score getting the full score
max(closeness(field, embeddings), closeness(field, title_embedding))
}
}

# First phase must be vector to allow hits that have no keyword matches
first-phase {
expression: vector_score
expression: closeness(field, embeddings)
}

# Weighted average between Vector Search and BM-25
# Each is a weighted average between the Title and Content fields
# Finally each doc is boosted by its user feedback based boost and recency
# If any embedding or index field is missing, it just receives a score of 0
# Assumptions:
# - For a given query + corpus, the BM-25 scores will be relatively similar in distribution
# therefore not normalizing before combining.
# - For documents without title, it gets a score of 0 for that and this is ok as documents
# without any title match should be penalized.
global-phase {
expression {
(
# Weighted Vector Similarity Score
(query(alpha) * normalize_linear(vector_score)) +
(
query(alpha) * (
(query(title_content_ratio) * normalize_linear(title_vector_score))
+
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
)
)

+

# Weighted Keyword Similarity Score
((1 - query(alpha)) * normalize_linear(keyword_score))
# Note: for the BM25 Title score, it requires decent stopword removal in the query
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
(
(1 - query(alpha)) * (
(query(title_content_ratio) * normalize_linear(bm25(title)))
+
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
)
)
)
# Boost based on user feedback
* document_boost
Expand All @@ -204,8 +198,6 @@ schema DANSWER_CHUNK_NAME {
bm25(content)
closeness(field, title_embedding)
closeness(field, embeddings)
keyword_score
vector_score
document_boost
recency_bias
closest(embeddings)
Expand All @@ -220,27 +212,4 @@ schema DANSWER_CHUNK_NAME {
}
}

# THE ONES BELOW ARE OUT OF DATE, DO NOT USE
# THEY MIGHT NOT EVEN WORK AT ALL
rank-profile keyword_search inherits default, default_rank {
first-phase {
expression: bm25(content) * document_boost * recency_bias
}

match-features: recency_bias document_boost bm25(content)
}

rank-profile semantic_searchVARIABLE_DIM inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
}

first-phase {
# Cannot do boost with the chosen embedding model because of high default similarity
# This depends on the embedding model chosen
expression: closeness(field, embeddings)
}

match-features: recency_bias document_boost closest(embeddings)
}
}

0 comments on commit 3909fba

Please sign in to comment.