Skip to content

Commit

Permalink
Merge pull request #222 from Cloud-Code-AI/10-add-rag-support
Browse files Browse the repository at this point in the history
[WIP]: 10 add rag support
  • Loading branch information
sauravpanda authored Aug 17, 2024
2 parents 77de3aa + b913aeb commit 9165fd5
Show file tree
Hide file tree
Showing 20 changed files with 2,280 additions and 52 deletions.
22 changes: 14 additions & 8 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,36 @@ GITHUB_APP_WEBHOOK_SECRET=
LOG_LEVEL="info"
LITELLM_LOG="ERROR"

# LLM SETUPS
## OPENAI SETUP
# LLM SETUPS [SETUP one of the below]
## 1 - OPENAI SETUP
OPENAI_API_KEY=
OPENAI_ORGANIZATION=
### [OPTIONAL]
OPENAI_API_BASE=

## Azure
## 2 - Azure
AZURE_API_KEY=
AZURE_API_BASE=
AZURE_API_VERSION=


## Anthropic
## 3 - Anthropic
ANTHROPIC_API_KEY=

## Google Gemini
## 4 - Google Gemini
GEMINI_API_KEY=

## LITELLM OBSERVABILITY
## LITELLM OBSERVABILITY [OPTIONAL]
SUPABASE_URL=
SUPABASE_KEY=


## POSTGRES Setup [Necessary for RAG]
POSTGRES_USER=
POSTGRES_PASSWORD=
POSTGRES_DB=

### REDIS SETUP
REDIS_PASSWORD=
REDIS_HOST=localhost
REDIS_PORT=6379

3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,5 @@ cython_debug/
node_modules
.next

.cloudcode
tree_sitter_languages/
12 changes: 12 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ FROM python:3.12-slim
# Set the working directory in the container
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
git \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Install Poetry
RUN pip install --no-cache-dir poetry

Expand All @@ -16,6 +22,12 @@ RUN poetry install --no-dev --no-root
# Copy the application code into the container
COPY . .

# Make the installation script executable
RUN chmod +x install_tree_sitter_languages.sh

# Run the Tree-sitter language installation script
RUN ./install_tree_sitter_languages.sh

# Expose the port on which the application will run
EXPOSE 8000

Expand Down
18 changes: 18 additions & 0 deletions Dockerfile-postgres
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM postgres:16-bullseye

# Build and install the pgvector extension.
#
# Everything happens in a single RUN layer on purpose: removing the build
# toolchain in a *later* RUN (as separate layers would do) does not shrink
# the image, because the packages remain baked into the earlier layer.
#
# NOTE(review): consider pinning a pgvector release tag (e.g.
# `--branch v0.7.4`) for reproducible builds instead of cloning HEAD.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        git \
        postgresql-server-dev-16 \
    && git clone --depth 1 https://github.com/pgvector/pgvector.git /tmp/pgvector \
    && make -C /tmp/pgvector \
    && make -C /tmp/pgvector install \
    && apt-get remove -y build-essential git postgresql-server-dev-16 \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/* /tmp/pgvector
21 changes: 21 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,27 @@
"enable_observability_logging": false,
"redis_enabled": true,
"models": [
{
"model_name": "embedding",
"litellm_params": {
"model": "azure/text-embedding-small",
"input_cost_per_token": 0.000000015,
"output_cost_per_token": 0.0000006,
"api_key": "os.environ/AZURE_API_KEY",
"api_base": "os.environ/AZURE_API_BASE"
}
},
{
"model_name": "small",
"litellm_params": {
"model": "azure/gpt-4o-mini",
"input_cost_per_token": 0.000000015,
"output_cost_per_token": 0.0000006,
"api_key": "os.environ/AZURE_API_KEY",
"api_base": "os.environ/AZURE_API_BASE",
"base_model": "azure/gpt-4o-mini"
}
},
{
"model_name": "default",
"litellm_params": {
Expand Down
81 changes: 81 additions & 0 deletions db_setup/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
-- Schema for the RAG code-indexing store (PostgreSQL + pgvector).
-- Idempotent: safe to re-run (all statements use IF NOT EXISTS).

-- Enable vector extension (requires pgvector installed in the server image)
CREATE EXTENSION IF NOT EXISTS vector;

-- Repositories that have been indexed
CREATE TABLE IF NOT EXISTS repositories (
    repo_id SERIAL PRIMARY KEY,
    repo_name TEXT NOT NULL,
    repo_owner TEXT NOT NULL,
    repo_url TEXT NOT NULL,
    repo_description TEXT,
    CONSTRAINT unique_repo UNIQUE (repo_name, repo_owner)
);

-- Table to store file information
CREATE TABLE IF NOT EXISTS files (
    file_id SERIAL PRIMARY KEY,
    repo_id INTEGER NOT NULL,
    file_path TEXT NOT NULL,
    file_name TEXT NOT NULL,
    file_ext TEXT NOT NULL,
    programming_language TEXT,
    -- deleting a repository removes its files (and, transitively, all rows below)
    CONSTRAINT files_repo_id_fk FOREIGN KEY (repo_id)
        REFERENCES repositories(repo_id) ON DELETE CASCADE,
    CONSTRAINT unique_repo_file UNIQUE (repo_id, file_path)
);

-- Table to store function abstractions (summaries of functions per file)
CREATE TABLE IF NOT EXISTS function_abstractions (
    function_id SERIAL PRIMARY KEY,
    file_id INTEGER NOT NULL,
    function_name TEXT NOT NULL,
    function_signature TEXT NOT NULL,
    abstract_functionality TEXT NOT NULL,
    complexity_score FLOAT,
    input_output_description TEXT,
    start_line INTEGER NOT NULL,
    end_line INTEGER NOT NULL,
    CONSTRAINT function_abstractions_file_id_fk FOREIGN KEY (file_id)
        REFERENCES files(file_id) ON DELETE CASCADE
);

-- Table to store vector embeddings for function abstractions.
-- 1536 dimensions must stay in sync with the embedding model configuration.
CREATE TABLE IF NOT EXISTS function_embeddings (
    embedding_id SERIAL PRIMARY KEY,
    function_id INTEGER NOT NULL,
    vector VECTOR(1536) NOT NULL,
    CONSTRAINT function_embeddings_function_id_fk FOREIGN KEY (function_id)
        REFERENCES function_abstractions(function_id) ON DELETE CASCADE,
    CONSTRAINT unique_function_embedding UNIQUE (function_id)
);

-- Raw syntax-tree nodes per file
CREATE TABLE IF NOT EXISTS syntax_nodes (
    node_id SERIAL PRIMARY KEY,
    file_id INTEGER NOT NULL,
    node_type TEXT NOT NULL,
    start_line INTEGER NOT NULL,
    end_line INTEGER NOT NULL,
    node_content TEXT,
    language TEXT NOT NULL,
    CONSTRAINT syntax_nodes_file_id_fk FOREIGN KEY (file_id)
        REFERENCES files(file_id) ON DELETE CASCADE
);

-- Table to store node relationships (edges between syntax nodes)
CREATE TABLE IF NOT EXISTS node_relationships (
    relationship_id SERIAL PRIMARY KEY,
    parent_node_id INTEGER NOT NULL,
    child_node_id INTEGER NOT NULL,
    relationship_type TEXT NOT NULL,
    CONSTRAINT node_relationships_parent_fk FOREIGN KEY (parent_node_id)
        REFERENCES syntax_nodes(node_id) ON DELETE CASCADE,
    CONSTRAINT node_relationships_child_fk FOREIGN KEY (child_node_id)
        REFERENCES syntax_nodes(node_id) ON DELETE CASCADE
);

-- Table to store node properties
CREATE TABLE IF NOT EXISTS node_properties (
    property_id SERIAL PRIMARY KEY,
    node_id INTEGER NOT NULL,
    property_name TEXT NOT NULL,
    property_value TEXT NOT NULL,
    CONSTRAINT node_properties_node_id_fk FOREIGN KEY (node_id)
        REFERENCES syntax_nodes(node_id) ON DELETE CASCADE
);

-- Lookup indexes
CREATE INDEX IF NOT EXISTS idx_file_path ON files(file_path);
CREATE INDEX IF NOT EXISTS idx_function_name ON function_abstractions(function_name);
CREATE INDEX IF NOT EXISTS idx_node_type ON syntax_nodes(node_type);

-- Foreign-key indexes: PostgreSQL does not index FK columns automatically,
-- and joins/cascaded deletes go through these columns.
CREATE INDEX IF NOT EXISTS idx_files_repo_id ON files(repo_id);
CREATE INDEX IF NOT EXISTS idx_function_abstractions_file_id ON function_abstractions(file_id);
CREATE INDEX IF NOT EXISTS idx_syntax_nodes_file_id ON syntax_nodes(file_id);

-- ANN index for vector similarity searches.
-- NOTE(review): an ivfflat index built on an empty table picks poor list
-- centroids; consider REINDEX after the initial bulk load.
CREATE INDEX IF NOT EXISTS idx_function_embeddings_vector
    ON function_embeddings USING ivfflat (vector vector_l2_ops);
24 changes: 23 additions & 1 deletion docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ services:
restart: always
secrets:
- github_app_pem
networks:
- app-network

redis:
image: "redis:alpine"
Expand All @@ -20,7 +22,27 @@ services:
- REDIS_PASSWORD=${REDIS_PASSWORD}
ports:
- "6379:6379"
networks:
- app-network

postgres:
container_name: postgres
build:
context: .
dockerfile: Dockerfile-postgres
env_file:
- .env
volumes:
- ./db_setup/init.sql:/docker-entrypoint-initdb.d/init.sql
ports:
- "5432:5432"
networks:
- app-network

secrets:
github_app_pem:
file: ./GITHUB_APP_NIGHTLY.pem

networks:
app-network:
driver: bridge
13 changes: 12 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,24 @@ services:
depends_on:
- redis

postgres:
  # Build from Dockerfile-postgres (as docker-compose-dev.yml does) so the
  # pgvector extension is present; the stock postgres:16-bullseye image would
  # fail on `CREATE EXTENSION vector` in the init script.
  build:
    context: .
    dockerfile: Dockerfile-postgres
  env_file:
    - .env
  volumes:
    # init script lives under db_setup/ — the repo has no top-level init.sql
    - ./db_setup/init.sql:/docker-entrypoint-initdb.d/init.sql
  ports:
    - "5432:5432"

redis:
image: "redis:alpine"
environment:
- REDIS_PASSWORD=${REDIS_PASSWORD}
ports:
- "6379:6379"


secrets:
github_app_pem:
file: ./GITHUB_APP_NIGHTLY.pem

22 changes: 22 additions & 0 deletions examples/ragify_codebase/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Example: query a RAG-indexed codebase with RepositoryAnalyzer."""
from kaizen.retriever.llama_index_retriever import RepositoryAnalyzer

# Create the analyzer once; it can serve any number of queries.
repo_analyzer = RepositoryAnalyzer()

# One-time setup per repository (re-run it whenever you want to re-index):
# repo_analyzer.setup_repository("./github_app/")

# Query the indexed repository as often as needed without re-running setup.
hits = repo_analyzer.query("Find functions that handle authentication")
for hit in hits:
    print(f"File: {hit['file_path']}")
    print(f"Abstraction: {hit['abstraction']}")
    print(f"result:\n{hit}")
    print(f"Relevance Score: {hit['relevance_score']}")
    print("---")

# After changing the repository, refresh the index first:
# repo_analyzer.setup_repository("/path/to/your/repo")

# ...then query again against the updated data.
hits = repo_analyzer.query("authentication")
47 changes: 47 additions & 0 deletions install_tree_sitter_languages.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
# Install Tree-sitter grammars by cloning each grammar repository and
# running `tree-sitter generate` inside it.
#
# Fail fast: without -e an earlier version could `cd` into a directory that
# failed to clone, silently run the build in the wrong place, and still
# report success.
set -euo pipefail

# Directory to store the language libraries
LANGUAGE_DIR="tree_sitter_languages"

# List of languages to install
LANGUAGES=(
    "python"
    "javascript"
    "typescript"
    "rust"
)

# Create the language directory if it doesn't exist
mkdir -p "$LANGUAGE_DIR"

# Install a single language grammar.
#   $1 - language name, e.g. "python"
install_language() {
    local lang="$1"
    local repo_dir="$LANGUAGE_DIR/tree-sitter-$lang"

    echo "Installing Tree-sitter parser for $lang..."

    # Clone the repository if it doesn't exist
    if [ ! -d "$repo_dir" ]; then
        git clone "https://github.com/tree-sitter/tree-sitter-$lang" "$repo_dir"
    fi

    # Build in a subshell: a failed `cd` aborts (set -e) instead of running
    # the build in the caller's directory, and the caller's cwd is untouched
    # (no fragile `cd ../..` needed).
    (
        cd "$repo_dir"
        git submodule update --init
        tree-sitter generate
    )

    echo "Tree-sitter parser for $lang installed successfully."
}

# Install each language
for lang in "${LANGUAGES[@]}"; do
    install_language "$lang"
done

echo "All Tree-sitter parsers have been installed."
9 changes: 9 additions & 0 deletions kaizen/llms/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,12 @@ def get_usage_cost(self, total_usage: Dict[str, int], model: str = None) -> floa
return litellm.cost_per_token(
model, total_usage["prompt_tokens"], total_usage["completion_tokens"]
)

def get_text_embedding(self, text):
    """Embed a single text string using the configured "embedding" model.

    Args:
        text: The text to embed.

    Returns:
        Tuple of (data, usage) from the provider's embedding response:
        ``data`` holds the embedding entry/entries, ``usage`` the token usage.
    """
    # "embedding" is the model_name alias declared in config.json.
    # NOTE(review): dimensions=1536 must stay in sync with VECTOR(1536)
    # in db_setup/init.sql — confirm when changing either.
    response = self.provider.embedding(
        model="embedding", input=[text], dimensions=1536, encoding_format="float"
    )
    return response["data"], response["usage"]
Empty file added kaizen/retriever/__init__.py
Empty file.
Loading

0 comments on commit 9165fd5

Please sign in to comment.