Feed in docs till we reach a token limit #401

Merged · 9 commits · Sep 5, 2023

backend/danswer/configs/app_configs.py — 6 additions & 1 deletion

@@ -138,7 +138,12 @@
 #####
 NUM_RETURNED_HITS = 50
 NUM_RERANKED_RESULTS = 15
-NUM_GENERATIVE_AI_INPUT_DOCS = 5
+# We feed in document chunks until we reach this token limit.
+# Default is ~5 full chunks (max chunk size is 2000 chars), although some chunks
+# may be smaller, which could result in passing in more total chunks.
+NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL = int(
+    os.environ.get("NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL") or (512 * 5)
+)
 # 1 edit per 2 characters, currently unused due to fuzzy match being too slow
 QUOTE_ALLOWED_ERROR_PERCENT = 0.05
 QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "10")  # 10 seconds
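
As a rough sanity check on the new default (an illustrative back-of-the-envelope sketch, not part of this diff; the ~4-chars-per-token figure is a common heuristic for English text, not a guarantee):

    # Why 512 * 5 tokens is roughly 5 full chunks (assumes ~4 chars/token).
    MAX_CHUNK_CHARS = 2000  # max chunk size mentioned in the comment above
    CHARS_PER_TOKEN = 4     # rough heuristic, not exact
    token_budget = 512 * 5  # the default above: 2560 tokens

    tokens_per_full_chunk = MAX_CHUNK_CHARS // CHARS_PER_TOKEN  # ~500 tokens
    print(token_budget // tokens_per_full_chunk)                # -> 5 full chunks

Smaller chunks cost fewer tokens, which is why more than five can fit under the same budget.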

backend/danswer/direct_qa/answer_question.py — 12 additions & 10 deletions

@@ -2,7 +2,7 @@

 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
-from danswer.configs.app_configs import NUM_GENERATIVE_AI_INPUT_DOCS
+from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
 from danswer.configs.app_configs import QA_TIMEOUT
 from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
@@ -11,6 +11,7 @@
 from danswer.direct_qa.exceptions import OpenAIKeyMissing
 from danswer.direct_qa.exceptions import UnknownModelError
 from danswer.direct_qa.llm_utils import get_default_qa_model
+from danswer.direct_qa.qa_utils import get_usable_chunks
 from danswer.search.danswer_helper import query_intent
 from danswer.search.keyword_search import retrieve_keyword_documents
 from danswer.search.models import QueryFlow
@@ -107,18 +108,19 @@ def answer_qa_query(
         chunk for chunk in ranked_chunks if not chunk.metadata.get(IGNORE_FOR_QA)
     ]

-    chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
-    if chunk_offset >= len(filtered_ranked_chunks):
-        raise ValueError("Chunks offset too large, should not retry this many times")
+    # get all chunks that fit into the token limit
+    usable_chunks = get_usable_chunks(
+        chunks=filtered_ranked_chunks,
+        token_limit=NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL,
+        offset=offset_count,
+    )
+    logger.debug(
+        f"Chunks fed to LLM: {[chunk.semantic_identifier for chunk in usable_chunks]}"
+    )

     error_msg = None
     try:
-        answer, quotes = qa_model.answer_question(
-            query,
-            filtered_ranked_chunks[
-                chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS
-            ],
-        )
+        answer, quotes = qa_model.answer_question(query, usable_chunks)
     except Exception as e:
         # exception is logged in the answer_question method, no need to re-log
         answer, quotes = None, None

backend/danswer/direct_qa/qa_utils.py — 47 additions & 0 deletions

@@ -10,6 +10,7 @@
 import regex

 from danswer.chunking.models import InferenceChunk
+from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
 from danswer.configs.app_configs import QUOTE_ALLOWED_ERROR_PERCENT
 from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY
 from danswer.configs.model_configs import GEN_AI_API_KEY
@@ -21,6 +22,7 @@
 from danswer.direct_qa.qa_prompts import QUOTE_PAT
 from danswer.direct_qa.qa_prompts import UNCERTAINTY_PAT
 from danswer.dynamic_configs import get_dynamic_config_store
+from danswer.llm.utils import check_number_of_tokens
 from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import clean_model_quote
 from danswer.utils.text_processing import shared_precompare_cleanup
Expand Down Expand Up @@ -254,3 +256,48 @@ def simulate_streaming_response(model_out: str) -> Generator[str, None, None]:
"""Mock streaming by generating the passed in model output, character by character"""
for token in model_out:
yield token


def _get_usable_chunks(
chunks: list[InferenceChunk], token_limit: int
) -> list[InferenceChunk]:
total_token_count = 0
usable_chunks = []
for chunk in chunks:
chunk_token_count = check_number_of_tokens(chunk.content)
if total_token_count + chunk_token_count > token_limit:
break

total_token_count += chunk_token_count
usable_chunks.append(chunk)

# try and return at least one chunk if possible. This chunk will
# get truncated later on in the pipeline. This would only occur if
# the first chunk is larger than the token limit (usually due to character
# count -> token count mismatches caused by special characters / non-ascii
# languages)
if not usable_chunks and chunks:
usable_chunks = [chunks[0]]

return usable_chunks


def get_usable_chunks(
chunks: list[InferenceChunk],
token_limit: int = NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL,
offset: int = 0,
) -> list[InferenceChunk]:
offset_into_chunks = 0
usable_chunks: list[InferenceChunk] = []
for _ in range(min(offset + 1, 1)): # go through this process at least once
if offset_into_chunks >= len(chunks) and offset_into_chunks > 0:
raise ValueError(
"Chunks offset too large, should not retry this many times"
)

usable_chunks = _get_usable_chunks(
chunks=chunks[offset_into_chunks:], token_limit=token_limit
)
offset_into_chunks += len(usable_chunks)

return usable_chunks
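
To see the packing behavior in isolation (a standalone sketch, not code from this PR: Doc is a hypothetical stand-in for InferenceChunk, and the one-token-per-word counter replaces check_number_of_tokens):

    from dataclasses import dataclass

    @dataclass
    class Doc:
        """Hypothetical stand-in for InferenceChunk."""
        content: str

    def count_tokens(text: str) -> int:
        # crude one-token-per-word counter, for illustration only
        return len(text.split())

    def pack_chunks(chunks: list[Doc], token_limit: int) -> list[Doc]:
        total: int = 0
        packed: list[Doc] = []
        for chunk in chunks:
            cost = count_tokens(chunk.content)
            if total + cost > token_limit:
                break
            total += cost
            packed.append(chunk)
        # mirror the fallback above: keep one oversized chunk rather than none
        return packed or chunks[:1]

    docs = [Doc("a " * 300), Doc("b " * 300), Doc("c " * 300)]  # 300 "tokens" each
    print(len(pack_chunks(docs, token_limit=650)))  # -> 2; a third would overflow

The offset parameter builds on the same packing: each retry re-packs starting after the chunks consumed by earlier attempts, so offset_count no longer multiplies a fixed document count the way it did with NUM_GENERATIVE_AI_INPUT_DOCS.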

backend/danswer/llm/utils.py — 16 additions & 0 deletions

@@ -1,5 +1,7 @@
 from collections.abc import Callable
 from collections.abc import Iterator
+
+import tiktoken
 from langchain.prompts.base import StringPromptValue
 from langchain.prompts.chat import ChatPromptValue
 from langchain.schema import PromptValue
@@ -69,3 +71,17 @@ def convert_input(lm_input: LanguageModelInput) -> str:

 def should_be_verbose() -> bool:
     return LOG_LEVEL == "debug"
+
+
+def check_number_of_tokens(
+    text: str, encode_fn: Callable[[str], list] | None = None
+) -> int:
+    """Gets the number of tokens in the provided text, using the provided encoding
+    function. If none is provided, default to the tiktoken encoder used by GPT-3.5
+    and GPT-4.
+    """
+
+    if encode_fn is None:
+        encode_fn = tiktoken.get_encoding("cl100k_base").encode
+
+    return len(encode_fn(text))
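
A quick usage sketch (assumes the tiktoken package is installed; the printed count for the tiktoken path depends on the encoding, so it is not hard-coded here):

    import tiktoken

    # default path: the cl100k_base encoding used by GPT-3.5 / GPT-4
    enc = tiktoken.get_encoding("cl100k_base")
    print(len(enc.encode("Feed in docs till we reach a token limit")))

    # equivalent to check_number_of_tokens(text, whitespace_encode):
    # any str -> list function can be swapped in as encode_fn
    def whitespace_encode(text: str) -> list[str]:
        return text.split()

    print(len(whitespace_encode("Feed in docs till we reach a token limit")))  # -> 9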

backend/danswer/server/search_backend.py — 13 additions & 10 deletions

@@ -9,7 +9,7 @@
 from danswer.auth.users import current_user
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
-from danswer.configs.app_configs import NUM_GENERATIVE_AI_INPUT_DOCS
+from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
 from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
 from danswer.db.engine import get_session
@@ -22,6 +22,7 @@
 from danswer.direct_qa.exceptions import UnknownModelError
 from danswer.direct_qa.interfaces import DanswerAnswerPiece
 from danswer.direct_qa.llm_utils import get_default_qa_model
+from danswer.direct_qa.qa_utils import get_usable_chunks
 from danswer.search.danswer_helper import query_intent
 from danswer.search.danswer_helper import recommend_search_flow
 from danswer.search.keyword_search import retrieve_keyword_documents
@@ -247,17 +248,19 @@ def stream_qa_portions(
             chunk for chunk in ranked_chunks if not chunk.metadata.get(IGNORE_FOR_QA)
         ]

-        chunk_offset = offset_count * NUM_GENERATIVE_AI_INPUT_DOCS
-        if chunk_offset >= len(filtered_ranked_chunks):
-            raise ValueError(
-                "Chunks offset too large, should not retry this many times"
-            )
+        # get all chunks that fit into the token limit
+        usable_chunks = get_usable_chunks(
+            chunks=filtered_ranked_chunks,
+            token_limit=NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL,
+            offset=offset_count,
+        )
+        logger.debug(
+            f"Chunks fed to LLM: {[chunk.semantic_identifier for chunk in usable_chunks]}"
+        )

        try:
            for response_packet in qa_model.answer_question_stream(
-                query,
-                filtered_ranked_chunks[
-                    chunk_offset : chunk_offset + NUM_GENERATIVE_AI_INPUT_DOCS
-                ],
+                query, usable_chunks
            ):
                if response_packet is None:
                    continue

deployment/docker_compose/docker-compose.dev.legacy.yml — 1 addition & 0 deletions

@@ -64,6 +64,7 @@ services:
       - GEN_AI_API_KEY=${GEN_AI_API_KEY:-}
       - GEN_AI_ENDPOINT=${GEN_AI_ENDPOINT:-}
       - GEN_AI_HOST_TYPE=${GEN_AI_HOST_TYPE:-}
+      - NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL=${NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL:-}
       - POSTGRES_HOST=relational_db
       - QDRANT_HOST=vector_db
       - TYPESENSE_HOST=search_engine

deployment/docker_compose/docker-compose.dev.yml — 1 addition & 0 deletions

@@ -21,6 +21,7 @@ services:
       - GEN_AI_API_KEY=${GEN_AI_API_KEY:-}
       - GEN_AI_ENDPOINT=${GEN_AI_ENDPOINT:-}
       - GEN_AI_HOST_TYPE=${GEN_AI_HOST_TYPE:-}
+      - NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL=${NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL:-}
       - POSTGRES_HOST=relational_db
       - VESPA_HOST=index
       - LOG_LEVEL=${LOG_LEVEL:-info}