google · kyolee415 · Oct 4, 2023 · Sep 29, 2023
@@ -487,15 +487,16 @@ To convert Markdown files to plain text files:
 
 ### 2. Populate a new vector database
 
-**Important**: If the `vector_stores/chroma` directory already exists, delete
-(or move) the `chroma` directory before populating a new vector database. Also,
-if the Docs Agent chat app is already running using this `chroma` directory, shut down
-the app before deleting the directory.
-
 Once you have plain text files processed and stored in the `output_path` directory,
 you can run the `populate_vector_database.py` script to populate a vector database
 with the contents of the plain text files and their embeddings (and metadata).
 
+**Important**: For a clean setup, if the `vector_stores/chroma` directory already
+exists, delete (or move) the `chroma` directory before populating a new vector
+database. (Otherwise, new entries will be added to your existing vector database.)
+Also, if the Docs Agent chat app is already running using this `chroma` directory,
+shut down the app before deleting the directory.
+
 To populate a new vector database:
 
 1. Go to the Docs Agent project directory, for example:

@@ -38,7 +38,7 @@ <h4 id="rewrite-question-header">Question:</h4>
   <span id="rewrite-question-span">
     <p>{{ question | replace("+", " ") | replace("%3F", "?")}}</p>
   </span>
-  <h4 id="rewrite-response-header">Bard's response:</h4>
+  <h4 id="rewrite-response-header">PaLM's response:</h4>
   <span id="rewrite-original-response-span">
     {{ response_in_html | safe }}
   </span>

@@ -42,12 +42,7 @@ class Chroma:
     """Chroma wrapper"""
 
     def __init__(self, chroma_dir) -> None:
-        self.client = chromadb.Client(
-            Settings(
-                chroma_db_impl="duckdb+parquet",
-                persist_directory=chroma_dir,
-            )
-        )
+        self.client = chromadb.PersistentClient(path=chroma_dir)
 
     def list_collections(self):
         return self.client.list_collections()

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docs-agent"
-version = "0.1.0"
+version = "0.1.1"
 description = ""
 authors = ["Docs Agent contributors"]
 readme = "README.md"
@@ -11,7 +11,7 @@ rich = "^13.3.5"
 Markdown = "^3.4.3"
 beautifulsoup4 = "^4.12.2"
 protobuf = ">=3.20"
-chromadb = "^0.3.21"
+chromadb = "==0.4.13"
 sentence-transformers = "^2.2.2"
 ratelimit = "^2.2.1"
 absl-py = "^1.4.0"
@@ -21,6 +21,7 @@ google-generativeai = "^0.1.0"
 grpcio = "^1.57.0"
 grpcio-tools = "^1.57.0"
 uuid = "^1.30"
+pytz = ">=2020.1"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "^8.13.2"

@@ -153,6 +153,9 @@ def process_page_and_section_titles(markdown_text):
         page_title = data["title"]
         markdown_text = data.content
         metadata = data.metadata
+    if "URL" in data:
+        final_url = data["URL"]
+        metadata["URL"] = final_url
     for line in markdown_text.split("\n"):
         new_line = ""
         skip_this_line = False
@@ -173,7 +176,7 @@ def process_page_and_section_titles(markdown_text):
             # Detect Markdown heading levels
             if heading == "#":
                 page_title = captured_title.strip()
-                metadata = {"title": page_title}
+                metadata["title"] = page_title
                 subsection_title = ""
                 section_title = ""
             elif heading == "##":

@@ -108,10 +108,7 @@
     MODEL = os.path.join(BASE_DIR, "models/all-mpnet-base-v2")
     emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL)
 
-chroma_client = chromadb.Client(
-    Settings(chroma_db_impl="duckdb+parquet", persist_directory=LOCAL_VECTOR_DB_DIR)
-)
-
+chroma_client = chromadb.PersistentClient(path=LOCAL_VECTOR_DB_DIR)
 
 # Create embed function for PaLM
 # API call limit to 5 qps
@@ -175,6 +172,8 @@ def embed_function(texts: Documents) -> Embeddings:
                 # Using the full path avoids mismatches
                 full_file_name = FULL_BASE_DIR + clean_filename + file
                 metadata_dict_extra = {}
+                # Flag to see if there is a predefined URL from frontmatter
+                final_url = False
                 # Reads the metadata associated with files
                 for key in index:
                     if full_file_name in index[key]:
@@ -197,6 +196,10 @@ def embed_function(texts: Documents) -> Embeddings:
                                 index[key][full_file_name]["metadata"], delimiter="_"
                             )
                             metadata_dict_extra = dict(metadata_dict_extra)
+                            # Extracts user specified URL
+                            if "URL" in metadata_dict_extra:
+                                final_url = True
+                                final_url_value = metadata_dict_extra["URL"]
                         else:
                             metadata_dict_extra = {}
                         if "UUID" in index[key][full_file_name]:
@@ -216,6 +219,9 @@ def embed_function(texts: Documents) -> Embeddings:
                 # Remove .md at the end of URLs by default.
                 match3 = re.search(r"(.*)\.md$", url)
                 url = match3[1]
+                # Replaces the URL if it comes from frontmatter
+                if (final_url):
+                    url = final_url_value
                 # Creates a dictionary with basic metadata values
                 # (i.e. source, URL, and md_hash)
                 metadata_dict_main = {
@@ -287,7 +293,6 @@ def embed_function(texts: Documents) -> Embeddings:
                     print("[Warning] Empty file!")
                 print("")
                 auto.close()
-chroma_client.persist()
 # results = collection.query(
 #     query_texts=["What are some differences between apples and oranges?"],
 #     n_results=3,

@@ -83,9 +83,7 @@ def embed_palm(texts: Documents) -> Embeddings:
 ai_console = Console(width=160)
 ai_console.rule("Fold")
 
-chroma_client = chromadb.Client(
-    Settings(chroma_db_impl="duckdb+parquet", persist_directory=LOCAL_VECTOR_DB_DIR)
-)
+chroma_client = chromadb.PersistentClient(path=LOCAL_VECTOR_DB_DIR)
 
 if EMBEDDINGS_TYPE == "PALM":
     PALM_EMBEDDING_MODEL = "models/embedding-gecko-001"