
Commit

Added boost to rerank step (#360)
yuhongsun96 authored Aug 31, 2023
1 parent 5b3abb4 commit ec4d0b8
Showing 9 changed files with 60 additions and 18 deletions.
7 changes: 7 additions & 0 deletions backend/danswer/chunking/models.py
@@ -5,7 +5,9 @@
from typing import cast

from danswer.configs.constants import BLURB
from danswer.configs.constants import BOOST
from danswer.configs.constants import METADATA
from danswer.configs.constants import SCORE
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.connectors.models import Document
@@ -57,6 +59,8 @@ class InferenceChunk(BaseChunk):
document_id: str
source_type: str
semantic_identifier: str
boost: int
score: float | None
metadata: dict[str, Any]

@classmethod
@@ -78,6 +82,9 @@ def from_dict(cls, init_dict: dict[str, Any]) -> "InferenceChunk":
init_kwargs[METADATA] = json.loads(init_kwargs[METADATA])
else:
init_kwargs[METADATA] = {}
init_kwargs[BOOST] = init_kwargs.get(BOOST, 1)
if SCORE not in init_kwargs:
init_kwargs[SCORE] = None
if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
logger.error(
f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
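
Chunks indexed before this commit won't carry the new fields, so from_dict backfills them. A minimal standalone sketch of that defaulting, using a hypothetical hit payload and plain string keys in place of the constants:

    init_kwargs = {"document_id": "doc-1", "metadata": "{}"}  # older index entry

    init_kwargs["boost"] = init_kwargs.get("boost", 1)  # falls back to 1 if absent
    if "score" not in init_kwargs:
        init_kwargs["score"] = None  # no relevance score attached at this point
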
1 change: 1 addition & 0 deletions backend/danswer/configs/constants.py
@@ -17,6 +17,7 @@
PUBLIC_DOC_PAT = "PUBLIC"
QUOTE = "quote"
BOOST = "boost"
SCORE = "score"
DEFAULT_BOOST = 0


11 changes: 5 additions & 6 deletions backend/danswer/datastores/datastore_utils.py
@@ -1,3 +1,4 @@
import math
import uuid
from collections.abc import Callable
from copy import deepcopy
@@ -12,15 +13,13 @@


DEFAULT_BATCH_SIZE = 30
BOOST_MULTIPLIER = 1.2
BOOST_MULTIPLIER = 2 # Try to keep this consistent with Vespa


def translate_boost_count_to_multiplier(boost: int) -> float:
if boost > 0:
return BOOST_MULTIPLIER**boost
elif boost < 0:
return 1 / (BOOST_MULTIPLIER**boost)
return 1
# Sigmoid function, maxed out at BOOST_MULTIPLIER
# 3 here stretches it out so we hit asymptote slower
return BOOST_MULTIPLIER / (1 + math.exp(-1 * boost / 3))


def get_uuid_from_chunk(
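
For intuition about the new curve, here is a standalone sketch (not part of the commit) that reproduces the formula above and prints a few sample multipliers:

    import math

    # Same formula as translate_boost_count_to_multiplier, with the cap of 2.
    def boost_multiplier(boost: int, cap: float = 2.0) -> float:
        return cap / (1 + math.exp(-1 * boost / 3))

    for b in (-10, -3, 0, 3, 10):
        print(b, round(boost_multiplier(b), 3))
    # -10 -> 0.069, -3 -> 0.538, 0 -> 1.0, 3 -> 1.462, 10 -> 1.931
    # A boost count of 0 is neutral (x1.0) and the multiplier stays within (0, 2).
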
@@ -36,6 +36,7 @@ schema danswer_chunk {
field section_continuation type bool {
indexing: summary | attribute
}
# Technically this one should be int, but can't change without causing breaks to existing index
field boost type float {
indexing: summary | attribute
}
@@ -64,7 +65,7 @@ schema danswer_chunk {

rank-profile keyword_search inherits default {
first-phase {
expression: bm25(content) * attribute(boost)
expression: bm25(content) * (2 / (1 + exp(-attribute(boost) / 3)))
}
}

@@ -73,7 +74,8 @@ schema danswer_chunk {
query(query_embedding) tensor<float>(x[384])
}
first-phase {
expression: closeness(field, embeddings) * attribute(boost)
# Cannot do boost with the chosen embedding model because of high default similarity
expression: closeness(field, embeddings)
}
match-features: closest(embeddings)
}
@@ -83,10 +85,11 @@ schema danswer_chunk {
query(query_embedding) tensor<float>(x[384])
}
first-phase {
expression: bm25(content)
expression: bm25(content) * (2 / (1 + exp(-attribute(boost) / 3)))
}
second-phase {
expression: closeness(field, embeddings) * attribute(boost)
# Cannot do boost with the chosen embedding model because of high default similarity
expression: closeness(field, embeddings)
}
match-features: closest(embeddings)
}
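
As a sanity check (not part of the commit), the inline ranking expression 2 / (1 + exp(-attribute(boost) / 3)) is the same sigmoid as translate_boost_count_to_multiplier in datastore_utils.py with BOOST_MULTIPLIER = 2; a minimal Python sketch of that equivalence:

    import math

    def vespa_style(boost: float) -> float:
        # Mirrors the rank-profile expression above.
        return 2 / (1 + math.exp(-boost / 3))

    def python_style(boost: float, multiplier: float = 2.0) -> float:
        # Mirrors translate_boost_count_to_multiplier.
        return multiplier / (1 + math.exp(-1 * boost / 3))

    assert all(abs(vespa_style(b) - python_style(b)) < 1e-9 for b in range(-10, 11))
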
13 changes: 9 additions & 4 deletions backend/danswer/datastores/vespa/store.py
@@ -22,10 +22,12 @@
from danswer.configs.constants import BOOST
from danswer.configs.constants import CHUNK_ID
from danswer.configs.constants import CONTENT
from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import METADATA
from danswer.configs.constants import PUBLIC_DOC_PAT
from danswer.configs.constants import SCORE
from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
@@ -176,7 +178,7 @@ def _index_vespa_chunks(
SECTION_CONTINUATION: chunk.section_continuation,
METADATA: json.dumps(document.metadata),
EMBEDDINGS: embeddings_name_vector_map,
BOOST: 1, # Boost value always starts at 1 for 0 impact on weight
BOOST: DEFAULT_BOOST,
ALLOWED_USERS: cross_connector_document_metadata_map[
document.id
].allowed_users,
@@ -277,7 +279,10 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
response.raise_for_status()

hits = response.json()["root"].get("children", [])
inference_chunks = [InferenceChunk.from_dict(hit["fields"]) for hit in hits]
inference_chunks = [
InferenceChunk.from_dict(dict(hit["fields"], **{SCORE: hit["relevance"]}))
for hit in hits
]

return inference_chunks
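
A minimal sketch of what the new list comprehension does per hit, with a made-up payload; the plain string keys stand in for the constants used above:

    # Hypothetical Vespa hit: stored fields plus the query-time relevance score.
    hit = {"fields": {"document_id": "doc-1", "boost": 0}, "relevance": 0.87}

    # The relevance is merged into the fields dict under the score key so that
    # InferenceChunk.from_dict can populate the new score attribute.
    fields_with_score = dict(hit["fields"], **{"score": hit["relevance"]})
    # -> {"document_id": "doc-1", "boost": 0, "score": 0.87}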

@@ -342,9 +347,9 @@ def update(self, update_requests: list[UpdateRequest]) -> None:
continue

update_dict: dict[str, dict] = {"fields": {}}
if update_request.boost:
if update_request.boost is not None:
update_dict["fields"][BOOST] = {"assign": update_request.boost}
if update_request.allowed_users:
if update_request.allowed_users is not None:
update_dict["fields"][ALLOWED_USERS] = {
"assign": update_request.allowed_users
}
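
The switch from truthiness checks to explicit "is not None" matters because a boost of 0 (the new DEFAULT_BOOST) and an empty allowed-users list are both falsy in Python; a small illustration, not from the commit itself:

    boost = 0  # e.g. a document being reset to the neutral default

    if boost:              # old check: False, so the update was silently skipped
        pass
    if boost is not None:  # new check: True, so the reset is actually applied
        pass
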
11 changes: 9 additions & 2 deletions backend/danswer/db/feedback.py
@@ -7,7 +7,6 @@

from danswer.configs.constants import QAFeedbackType
from danswer.configs.constants import SearchFeedbackType
from danswer.datastores.datastore_utils import translate_boost_count_to_multiplier
from danswer.datastores.document_index import get_default_document_index
from danswer.datastores.interfaces import UpdateRequest
from danswer.db.models import Document as DbDocument
@@ -56,6 +55,14 @@ def update_document_boost(db_session: Session, document_id: str, boost: int) ->
raise ValueError(f"No document found with ID: '{document_id}'")

result.boost = boost

update = UpdateRequest(
document_ids=[document_id],
boost=boost,
)

get_default_document_index().update([update])

db_session.commit()


@@ -137,7 +144,7 @@ def create_doc_retrieval_feedback(
document_index = get_default_document_index()
update = UpdateRequest(
document_ids=[document_id],
boost=translate_boost_count_to_multiplier(doc_m.boost),
boost=doc_m.boost,
)
# Updates are generally batched for efficiency, this case only 1 doc/value is updated
document_index.update([update])
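
With translate_boost_count_to_multiplier dropped from this module, the raw boost count is what gets written to the document index, and the conversion to a score multiplier now happens at rerank time. A rough sketch of that flow with illustrative values only:

    import math

    boost_count = 3  # e.g. persisted after several positive feedback events
    stored_in_index = boost_count  # written as-is via UpdateRequest (this commit)
    rerank_multiplier = 2 / (1 + math.exp(-boost_count / 3))  # ~1.46, applied later
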
18 changes: 16 additions & 2 deletions backend/danswer/search/semantic_search.py
@@ -15,6 +15,7 @@
from danswer.configs.model_configs import ASYMMETRIC_PREFIX
from danswer.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
from danswer.datastores.datastore_utils import translate_boost_count_to_multiplier
from danswer.datastores.interfaces import DocumentIndex
from danswer.datastores.interfaces import IndexFilter
from danswer.search.models import Embedder
@@ -36,6 +37,8 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
link=chunk.source_links.get(0) if chunk.source_links else None,
blurb=chunk.blurb,
source_type=chunk.source_type,
boost=chunk.boost,
score=chunk.score,
)
# semantic identifier should always exist but for really old indices, it was not enforced
for chunk in chunks
@@ -57,13 +60,24 @@ def semantic_reranking(
encoder.predict([(query, chunk.content) for chunk in chunks]) # type: ignore
for encoder in cross_encoders
]
averaged_sim_scores = sum(sim_scores) / len(sim_scores)
scored_results = list(zip(averaged_sim_scores, chunks))

shifted_sim_scores = sum(
[enc_n_scores - numpy.min(enc_n_scores) for enc_n_scores in sim_scores]
) / len(sim_scores)

boosts = [translate_boost_count_to_multiplier(chunk.boost) for chunk in chunks]
boosted_sim_scores = shifted_sim_scores * boosts
scored_results = list(zip(boosted_sim_scores, chunks))
scored_results.sort(key=lambda x: x[0], reverse=True)
ranked_sim_scores, ranked_chunks = zip(*scored_results)

logger.debug(f"Reranked similarity scores: {ranked_sim_scores}")

# Assign new chunk scores based on reranking
# TODO if pagination is added, the scores won't make sense with respect to the non-reranked hits
for ind, chunk in enumerate(ranked_chunks):
chunk.score = ranked_sim_scores[ind]

return list(ranked_chunks)


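
Putting the rerank math together: each cross-encoder's scores are shifted so their minimum is zero, the shifted scores are averaged across encoders, and each chunk's score is then scaled by its boost multiplier before sorting. A standalone sketch with made-up numbers; names mirror the code above:

    import numpy

    # Two hypothetical cross-encoders scoring three chunks.
    sim_scores = [numpy.array([2.0, 1.0, 3.0]), numpy.array([-1.0, 0.5, 0.0])]

    # Shift each encoder's scores so its minimum is 0, then average across encoders.
    shifted_sim_scores = sum(
        scores - numpy.min(scores) for scores in sim_scores
    ) / len(sim_scores)

    # Scale by per-chunk boost multipliers (as translate_boost_count_to_multiplier
    # would produce), then rank descending; these become the new chunk scores.
    boosts = numpy.array([1.0, 1.462, 0.538])
    boosted_sim_scores = shifted_sim_scores * boosts
    ranked_order = numpy.argsort(boosted_sim_scores)[::-1]
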
2 changes: 2 additions & 0 deletions backend/danswer/server/models.py
@@ -126,6 +126,8 @@ class SearchDoc(BaseModel):
link: str | None
blurb: str
source_type: str
boost: int
score: float | None


class QuestionRequest(BaseModel):
@@ -112,6 +112,8 @@ def test_fuzzy_match_quotes_to_docs(self) -> None:
blurb="anything",
semantic_identifier="anything",
section_continuation=False,
boost=0,
score=1,
metadata={},
)
test_chunk_1 = InferenceChunk(
@@ -123,6 +125,8 @@ def test_fuzzy_match_quotes_to_docs(self) -> None:
blurb="whatever",
semantic_identifier="whatever",
section_continuation=False,
boost=0,
score=1,
metadata={},
)

