Skip to content

Commit

Permalink
Merge pull request #222 from Cloud-Code-AI/10-add-rag-support
Browse files Browse the repository at this point in the history
[WIP]: 10 add rag support
  • Loading branch information
sauravpanda authored Aug 17, 2024
2 parents 77de3aa + b913aeb commit 9165fd5
Show file tree
Hide file tree
Showing 20 changed files with 2,280 additions and 52 deletions.
22 changes: 14 additions & 8 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,36 @@ GITHUB_APP_WEBHOOK_SECRET=
LOG_LEVEL="info"
LITELLM_LOG="ERROR"

# LLM SETUPS
## OPENAI SETUP
# LLM SETUPS [SETUP one of the below]
## 1 - OPENAI SETUP
OPENAI_API_KEY=
OPENAI_ORGANIZATION=
### [OPTIONAL]
OPENAI_API_BASE=

## Azure
## 2 - Azure
AZURE_API_KEY=
AZURE_API_BASE=
AZURE_API_VERSION=


## Anthropic
## 3 - Anthropic
ANTHROPIC_API_KEY=

## Google Gemini
## 4 - Google Gemini
GEMINI_API_KEY=

## LITELLM OBSERVABILITY
## LITELLM OBSERVABILITY [OPTIONAL]
SUPABASE_URL=
SUPABASE_KEY=


## POSTGRES Setup [Necessary for RAG]
POSTGRES_USER=
POSTGRES_PASSWORD=
POSTGRES_DB=

### REDIS SETUP
REDIS_PASSWORD=
REDIS_HOST=localhost
REDIS_PORT=6379

3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,5 @@ cython_debug/
node_modules
.next

.cloudcode
tree_sitter_languages/
12 changes: 12 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ FROM python:3.12-slim
# Set the working directory in the container
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
git \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Install Poetry
RUN pip install --no-cache-dir poetry

Expand All @@ -16,6 +22,12 @@ RUN poetry install --no-dev --no-root
# Copy the application code into the container
COPY . .

# Make the installation script executable
RUN chmod +x install_tree_sitter_languages.sh

# Run the Tree-sitter language installation script
RUN ./install_tree_sitter_languages.sh

# Expose the port on which the application will run
EXPOSE 8000

Expand Down
18 changes: 18 additions & 0 deletions Dockerfile-postgres
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM postgres:16-bullseye

# Build and install the pgvector extension.
#
# Everything happens in a single RUN layer on purpose: removing the build
# toolchain in a *later* RUN (as separate layers would do) does not shrink
# the image, because the packages remain baked into the earlier layer.
#
# NOTE(review): consider pinning a pgvector release tag (e.g.
# `--branch v0.7.4`) for reproducible builds instead of cloning HEAD.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        git \
        postgresql-server-dev-16 \
    && git clone --depth 1 https://github.com/pgvector/pgvector.git /tmp/pgvector \
    && make -C /tmp/pgvector \
    && make -C /tmp/pgvector install \
    && apt-get remove -y build-essential git postgresql-server-dev-16 \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/* /tmp/pgvector
21 changes: 21 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,27 @@
"enable_observability_logging": false,
"redis_enabled": true,
"models": [
{
"model_name": "embedding",
"litellm_params": {
"model": "azure/text-embedding-small",
"input_cost_per_token": 0.000000015,
"output_cost_per_token": 0.0000006,
"api_key": "os.environ/AZURE_API_KEY",
"api_base": "os.environ/AZURE_API_BASE"
}
},
{
"model_name": "small",
"litellm_params": {
"model": "azure/gpt-4o-mini",
"input_cost_per_token": 0.000000015,
"output_cost_per_token": 0.0000006,
"api_key": "os.environ/AZURE_API_KEY",
"api_base": "os.environ/AZURE_API_BASE",
"base_model": "azure/gpt-4o-mini"
}
},
{
"model_name": "default",
"litellm_params": {
Expand Down
81 changes: 81 additions & 0 deletions db_setup/init.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
-- Schema for the RAG code-indexing store (PostgreSQL + pgvector).
-- Idempotent: safe to re-run (all statements use IF NOT EXISTS).

-- Enable vector extension (requires pgvector installed in the server image)
CREATE EXTENSION IF NOT EXISTS vector;

-- Repositories that have been indexed
CREATE TABLE IF NOT EXISTS repositories (
    repo_id SERIAL PRIMARY KEY,
    repo_name TEXT NOT NULL,
    repo_owner TEXT NOT NULL,
    repo_url TEXT NOT NULL,
    repo_description TEXT,
    CONSTRAINT unique_repo UNIQUE (repo_name, repo_owner)
);

-- Table to store file information
CREATE TABLE IF NOT EXISTS files (
    file_id SERIAL PRIMARY KEY,
    repo_id INTEGER NOT NULL,
    file_path TEXT NOT NULL,
    file_name TEXT NOT NULL,
    file_ext TEXT NOT NULL,
    programming_language TEXT,
    -- deleting a repository removes its files (and, transitively, all rows below)
    CONSTRAINT files_repo_id_fk FOREIGN KEY (repo_id)
        REFERENCES repositories(repo_id) ON DELETE CASCADE,
    CONSTRAINT unique_repo_file UNIQUE (repo_id, file_path)
);

-- Table to store function abstractions (summaries of functions per file)
CREATE TABLE IF NOT EXISTS function_abstractions (
    function_id SERIAL PRIMARY KEY,
    file_id INTEGER NOT NULL,
    function_name TEXT NOT NULL,
    function_signature TEXT NOT NULL,
    abstract_functionality TEXT NOT NULL,
    complexity_score FLOAT,
    input_output_description TEXT,
    start_line INTEGER NOT NULL,
    end_line INTEGER NOT NULL,
    CONSTRAINT function_abstractions_file_id_fk FOREIGN KEY (file_id)
        REFERENCES files(file_id) ON DELETE CASCADE
);

-- Table to store vector embeddings for function abstractions.
-- 1536 dimensions must stay in sync with the embedding model configuration.
CREATE TABLE IF NOT EXISTS function_embeddings (
    embedding_id SERIAL PRIMARY KEY,
    function_id INTEGER NOT NULL,
    vector VECTOR(1536) NOT NULL,
    CONSTRAINT function_embeddings_function_id_fk FOREIGN KEY (function_id)
        REFERENCES function_abstractions(function_id) ON DELETE CASCADE,
    CONSTRAINT unique_function_embedding UNIQUE (function_id)
);

-- Raw syntax-tree nodes per file
CREATE TABLE IF NOT EXISTS syntax_nodes (
    node_id SERIAL PRIMARY KEY,
    file_id INTEGER NOT NULL,
    node_type TEXT NOT NULL,
    start_line INTEGER NOT NULL,
    end_line INTEGER NOT NULL,
    node_content TEXT,
    language TEXT NOT NULL,
    CONSTRAINT syntax_nodes_file_id_fk FOREIGN KEY (file_id)
        REFERENCES files(file_id) ON DELETE CASCADE
);

-- Table to store node relationships (edges between syntax nodes)
CREATE TABLE IF NOT EXISTS node_relationships (
    relationship_id SERIAL PRIMARY KEY,
    parent_node_id INTEGER NOT NULL,
    child_node_id INTEGER NOT NULL,
    relationship_type TEXT NOT NULL,
    CONSTRAINT node_relationships_parent_fk FOREIGN KEY (parent_node_id)
        REFERENCES syntax_nodes(node_id) ON DELETE CASCADE,
    CONSTRAINT node_relationships_child_fk FOREIGN KEY (child_node_id)
        REFERENCES syntax_nodes(node_id) ON DELETE CASCADE
);

-- Table to store node properties
CREATE TABLE IF NOT EXISTS node_properties (
    property_id SERIAL PRIMARY KEY,
    node_id INTEGER NOT NULL,
    property_name TEXT NOT NULL,
    property_value TEXT NOT NULL,
    CONSTRAINT node_properties_node_id_fk FOREIGN KEY (node_id)
        REFERENCES syntax_nodes(node_id) ON DELETE CASCADE
);

-- Lookup indexes
CREATE INDEX IF NOT EXISTS idx_file_path ON files(file_path);
CREATE INDEX IF NOT EXISTS idx_function_name ON function_abstractions(function_name);
CREATE INDEX IF NOT EXISTS idx_node_type ON syntax_nodes(node_type);

-- Foreign-key indexes: PostgreSQL does not index FK columns automatically,
-- and joins/cascaded deletes go through these columns.
CREATE INDEX IF NOT EXISTS idx_files_repo_id ON files(repo_id);
CREATE INDEX IF NOT EXISTS idx_function_abstractions_file_id ON function_abstractions(file_id);
CREATE INDEX IF NOT EXISTS idx_syntax_nodes_file_id ON syntax_nodes(file_id);

-- ANN index for vector similarity searches.
-- NOTE(review): an ivfflat index built on an empty table picks poor list
-- centroids; consider REINDEX after the initial bulk load.
CREATE INDEX IF NOT EXISTS idx_function_embeddings_vector
    ON function_embeddings USING ivfflat (vector vector_l2_ops);
24 changes: 23 additions & 1 deletion docker-compose-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ services:
restart: always
secrets:
- github_app_pem
networks:
- app-network

redis:
image: "redis:alpine"
Expand All @@ -20,7 +22,27 @@ services:
- REDIS_PASSWORD=${REDIS_PASSWORD}
ports:
- "6379:6379"
networks:
- app-network

postgres:
container_name: postgres
build:
context: .
dockerfile: Dockerfile-postgres
env_file:
- .env
volumes:
- ./db_setup/init.sql:/docker-entrypoint-initdb.d/init.sql
ports:
- "5432:5432"
networks:
- app-network

secrets:
github_app_pem:
file: ./GITHUB_APP_NIGHTLY.pem

networks:
app-network:
driver: bridge
13 changes: 12 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,24 @@ services:
depends_on:
- redis

postgres:
  # Build from Dockerfile-postgres (as docker-compose-dev.yml does) so the
  # pgvector extension is present; the stock postgres:16-bullseye image would
  # fail on `CREATE EXTENSION vector` in the init script.
  build:
    context: .
    dockerfile: Dockerfile-postgres
  env_file:
    - .env
  volumes:
    # init script lives under db_setup/ — the repo has no top-level init.sql
    - ./db_setup/init.sql:/docker-entrypoint-initdb.d/init.sql
  ports:
    - "5432:5432"

redis:
image: "redis:alpine"
environment:
- REDIS_PASSWORD=${REDIS_PASSWORD}
ports:
- "6379:6379"


secrets:
github_app_pem:
file: ./GITHUB_APP_NIGHTLY.pem

22 changes: 22 additions & 0 deletions examples/ragify_codebase/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Example: query a RAG-indexed codebase with RepositoryAnalyzer."""
from kaizen.retriever.llama_index_retriever import RepositoryAnalyzer

# Create the analyzer once; it can serve any number of queries.
repo_analyzer = RepositoryAnalyzer()

# One-time setup per repository (re-run it whenever you want to re-index):
# repo_analyzer.setup_repository("./github_app/")

# Query the indexed repository as often as needed without re-running setup.
hits = repo_analyzer.query("Find functions that handle authentication")
for hit in hits:
    print(f"File: {hit['file_path']}")
    print(f"Abstraction: {hit['abstraction']}")
    print(f"result:\n{hit}")
    print(f"Relevance Score: {hit['relevance_score']}")
    print("---")

# After changing the repository, refresh the index first:
# repo_analyzer.setup_repository("/path/to/your/repo")

# ...then query again against the updated data.
hits = repo_analyzer.query("authentication")
47 changes: 47 additions & 0 deletions install_tree_sitter_languages.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
# Install Tree-sitter grammars by cloning each grammar repository and
# running `tree-sitter generate` inside it.
#
# Fail fast: without -e an earlier version could `cd` into a directory that
# failed to clone, silently run the build in the wrong place, and still
# report success.
set -euo pipefail

# Directory to store the language libraries
LANGUAGE_DIR="tree_sitter_languages"

# List of languages to install
LANGUAGES=(
    "python"
    "javascript"
    "typescript"
    "rust"
)

# Create the language directory if it doesn't exist
mkdir -p "$LANGUAGE_DIR"

# Install a single language grammar.
#   $1 - language name, e.g. "python"
install_language() {
    local lang="$1"
    local repo_dir="$LANGUAGE_DIR/tree-sitter-$lang"

    echo "Installing Tree-sitter parser for $lang..."

    # Clone the repository if it doesn't exist
    if [ ! -d "$repo_dir" ]; then
        git clone "https://github.com/tree-sitter/tree-sitter-$lang" "$repo_dir"
    fi

    # Build in a subshell: a failed `cd` aborts (set -e) instead of running
    # the build in the caller's directory, and the caller's cwd is untouched
    # (no fragile `cd ../..` needed).
    (
        cd "$repo_dir"
        git submodule update --init
        tree-sitter generate
    )

    echo "Tree-sitter parser for $lang installed successfully."
}

# Install each language
for lang in "${LANGUAGES[@]}"; do
    install_language "$lang"
done

echo "All Tree-sitter parsers have been installed."
9 changes: 9 additions & 0 deletions kaizen/llms/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,12 @@ def get_usage_cost(self, total_usage: Dict[str, int], model: str = None) -> floa
return litellm.cost_per_token(
model, total_usage["prompt_tokens"], total_usage["completion_tokens"]
)

def get_text_embedding(self, text):
    """Embed a single text string using the configured "embedding" model.

    Args:
        text: The text to embed.

    Returns:
        Tuple of (data, usage) from the provider's embedding response:
        ``data`` holds the embedding entry/entries, ``usage`` the token usage.
    """
    # "embedding" is the model_name alias declared in config.json.
    # NOTE(review): dimensions=1536 must stay in sync with VECTOR(1536)
    # in db_setup/init.sql — confirm when changing either.
    response = self.provider.embedding(
        model="embedding", input=[text], dimensions=1536, encoding_format="float"
    )
    return response["data"], response["usage"]
Empty file added kaizen/retriever/__init__.py
Empty file.
Loading

0 comments on commit 9165fd5

Please sign in to comment.