Skip to content

Commit

Permalink
Merge pull request #585 from Cloud-Code-AI/584-improve-code-review-wi…
Browse files Browse the repository at this point in the history
…th-multiprompt-approach

feat: enhance pr review process and code quality
  • Loading branch information
sauravpanda authored Sep 28, 2024
2 parents fad79b6 + 2e82d48 commit 6a90f6f
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 93 deletions.
19 changes: 3 additions & 16 deletions .experiments/code_review/dataset/pr_5/issues.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
"suggested_code": "response = completion(\n model=os.environ.get(\"model\", \"anyscale/mistralai/Mixtral-8x22B-Instruct-v0.1\"), messages=messages\n)",
"fixed_code": "import time\n\nfor attempt in range(3):\n try:\n response = completion(\n model=os.environ.get(\"model\", \"anyscale/mistralai/Mixtral-8x22B-Instruct-v0.1\"), messages=messages\n )\n break\n except Exception as e:\n if attempt < 2:\n time.sleep(2 ** attempt)\n else:\n raise e",
"file_path": "main.py",
"start_line": 66,
"end_line": 68,
"start_line": 69,
"end_line": 71,
"severity": 9
},
{
Expand All @@ -38,19 +38,6 @@
"end_line": 84,
"severity": 8
},
{
"category": "Inefficient Progress Printing",
"description": "The progress printing method is inefficient.",
"impact": "high",
"rationale": "Printing progress in this manner can be slow and resource-intensive.",
"recommendation": "Use a more efficient method for printing progress, such as updating the progress less frequently or using a dedicated progress reporting library like tqdm.",
"suggested_code": "print(f\"\\rProgress:[{'=' * int(50 * progress):<50}]{progress:.0%}\", end=\"\", flush=True)",
"fixed_code": "if index % max(1, len(df) // 100) == 0: # Update every 1%\n print(f\"\\rProgress:[{'=' * int(50 * progress):<50}]{progress:.0%}\", end=\"\", flush=True)",
"file_path": "main.py",
"start_line": 121,
"end_line": 122,
"severity": 5
},
{
"category": "Redundant Code",
"description": "The check for an empty DataFrame is redundant.",
Expand Down Expand Up @@ -88,6 +75,6 @@
"file_path": "main.py",
"start_line": 174,
"end_line": 175,
"severity": 6
"severity": 8
}
]
22 changes: 18 additions & 4 deletions .experiments/code_review/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import datetime
import logging
from tqdm import tqdm
from kaizen.helpers import parser
from kaizen.reviewer.code_review import CodeReviewer
from kaizen.llms.provider import LLMProvider
from github_app.github_helper.utils import get_diff_text, get_pr_files
Expand Down Expand Up @@ -45,6 +46,15 @@ def process_pr(pr_url, reeval_response=False):
model="best",
)

combined_diff_data = ""
for file in pr_files:
patch_details = file.get("patch")
filename = file.get("filename", "").replace(" ", "")
combined_diff_data = (
combined_diff_data
+ f"\n---->\nFile Name: {filename}\nPatch Details: {parser.patch_to_combined_chunks(patch_details)}"
)

# topics = clean_keys(review_data.topics, "important")
logger.info(review_data.topics)
review_desc = create_pr_review_text(
Expand All @@ -57,16 +67,17 @@ def process_pr(pr_url, reeval_response=False):
comments, topics = create_review_comments(review_data.topics)
logger.info(f"Model: {review_data.model_name}\nUsage: {review_data.usage}")
logger.info(f"Completed processing PR: {pr_url}")
return review_desc, comments, review_data.issues
return review_desc, comments, review_data.issues, combined_diff_data


def save_review(pr_number, review_desc, comments, issues, folder):
def save_review(pr_number, review_desc, comments, issues, folder, combined_diff_data):
folder = os.path.join(folder, f"pr_{pr_number}")
logger.info(f"Saving review for PR {pr_number} in {folder}")
os.makedirs(folder, exist_ok=True)
review_file = os.path.join(folder, "review.md")
comments_file = os.path.join(folder, "comments.json")
issues_file = os.path.join(folder, "issues.json")
combined_diff = os.path.join(folder, "combined_diff.txt")

with open(review_file, "w") as f:
f.write(review_desc)
Expand All @@ -76,6 +87,9 @@ def save_review(pr_number, review_desc, comments, issues, folder):

with open(issues_file, "w") as f:
json.dump(issues, f, indent=2)

with open(combined_diff, 'w') as f:
f.write(combined_diff_data)

logger.info(f"Saved review files for PR {pr_number}")

Expand All @@ -97,8 +111,8 @@ def main(pr_urls):
logger.info(f"Starting to process PR {pr_number}")

# Without re-evaluation
review_desc, comments, issues = process_pr(pr_url, reeval_response=False)
save_review(pr_number, review_desc, comments, issues, no_eval_folder)
review_desc, comments, issues, combined_diff_data = process_pr(pr_url, reeval_response=False)
save_review(pr_number, review_desc, comments, issues, no_eval_folder, combined_diff_data)

# # With re-evaluation
# review_desc, comments, topics = process_pr(pr_url, reeval_response=True)
Expand Down
20 changes: 8 additions & 12 deletions kaizen/generator/pr_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def _process_files_generator(
CODE_DIFF="",
)
)

diff_parts = []
for file in pull_request_files:
patch_details = file.get("patch")
filename = file.get("filename", "")
Expand All @@ -129,13 +129,11 @@ def _process_files_generator(
filename.split(".")[-1] not in parser.EXCLUDED_FILETYPES
and patch_details is not None
):
temp_prompt = (
combined_diff_data
+ f"\n---->\nFile Name: {filename}\nPatch Details: \n{patch_details}"
)

diff_parts.append(f"\n---->\nFile Name: {filename}\nPatch Details: \n{patch_details}")

if available_tokens - self.provider.get_token_count(temp_prompt) > 0:
combined_diff_data = temp_prompt
if available_tokens - self.provider.get_token_count("".join(diff_parts)) > 0:
combined_diff_data = "".join(diff_parts)
continue

yield self._process_file_chunk(
Expand All @@ -144,13 +142,11 @@ def _process_files_generator(
pull_request_desc,
user,
)
combined_diff_data = (
f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"
)
diff_parts = [f"\n---->\nFile Name: {filename}\nPatch Details: {patch_details}"]

if combined_diff_data:
if diff_parts:
yield self._process_file_chunk(
combined_diff_data,
"".join(diff_parts),
pull_request_title,
pull_request_desc,
user,
Expand Down
101 changes: 55 additions & 46 deletions kaizen/helpers/parser.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,71 @@
import json
import re
import os

EXCLUDED_FILETYPES = [
# Compiled output
"class",
"o",
"obj",
"exe",
"dll",
"pyc",
"pyo",
"class", "o", "obj", "exe", "dll", "pyc", "pyo",
# Package manager files
"lock", # Covers package-lock.json, yarn.lock, Gemfile.lock, composer.lock
# IDE configurations
"idea",
"vscode",
"project",
"classpath",
# Build artifacts and dependencies
"node_modules",
"vendor",
"target",
"build",
"idea", "vscode", "project", "classpath",
# Binary and large files
"zip",
"tar",
"gz",
"rar",
"pdf",
"doc",
"docx",
"xls",
"xlsx",
"jpg",
"jpeg",
"png",
"gif",
"bmp",
"ico",
"mp3",
"mp4",
"avi",
"mov",
"zip", "tar", "gz", "rar", "pdf", "doc", "docx", "xls", "xlsx",
"jpg", "jpeg", "png", "gif", "bmp", "ico", "mp3", "mp4", "avi", "mov",
# Log files
"log",
# Database files
"db",
"sqlite",
"db", "sqlite",
# Temporary files
"tmp",
"temp",
"tmp", "temp",
# OS-specific files
"DS_Store",
"Thumbs.db",
"DS_Store", "Thumbs.db",
# Configuration files
"gitignore",
"dockerignore",
# Add any other specific extensions or directory names you want to exclude
"gitignore", "dockerignore",
# Add any other specific extensions you want to exclude
]

# List of folders to exclude
EXCLUDED_FOLDERS = [
"node_modules",
"dist",
"out",
"vendor",
"target",
"build",
"__pycache__",
".git",
# Add any other folders you want to exclude
]


def should_ignore_file(filepath):
"""
Check if a file should be ignored based on its path, name, or extension.
:param filepath: The full path of the file to check
:return: True if the file should be ignored, False otherwise
"""
# Get the file name and extension
filename = os.path.basename(filepath)
_, extension = os.path.splitext(filename)
extension = extension.lstrip('.') # Remove the leading dot

# Check if the file is in an excluded folder
for folder in EXCLUDED_FOLDERS:
if folder in filepath.split(os.path.sep):
return True

# Check if the file extension is in the excluded list
if extension.lower() in EXCLUDED_FILETYPES:
return True

# Check for specific filenames
if filename in ["package-lock.json", "yarn.lock", "Gemfile.lock", "composer.lock", ".DS_Store", "Thumbs.db"]:
return True

return False


def extract_json(text):
# Find the start and end positions of the JSON data
Expand Down Expand Up @@ -166,11 +173,13 @@ def format_change(old_num, new_num, change_type, content, ignore_deletions=False


def patch_to_combined_chunks(patch_text, ignore_deletions=False):
if not patch_text:
return ""
patch_text = patch_text.replace("\r\n", "\n")
lines = patch_text.splitlines(keepends=True)
changes = []
removal_line_num = 0
addition_line_num = 0
removal_line_num = 1
addition_line_num = 1
is_diff = False
removed_hunk = False
current_file_name = ""
Expand Down
23 changes: 22 additions & 1 deletion kaizen/llms/prompts/code_review_prompts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
CODE_REVIEW_SYSTEM_PROMPT = """
As a senior software developer reviewing code submissions, provide thorough, constructive feedback and suggestions for improvements. Consider best practices, error handling, performance, readability, and maintainability. Offer objective and respectful reviews that help developers enhance their skills and code quality. Use your expertise to provide comprehensive feedback without asking clarifying questions.
You are an expert code reviewer. Provide thorough, constructive feedback on code submissions, considering best practices, error handling, performance, readability, maintainability, and security. Be objective, respectful, and focus on helping developers improve their code quality.
Review Process:
1. Analyze provided context and diff to understand changes.
2. Evaluate changes for correctness, performance, security, and maintainability.
3. Identify improvement opportunities, considering best practices and patterns.
4. Assess error handling and potential edge cases.
5. Consider testing implications and documentation needs.
6. Analyze impact on dependencies and overall system.
7. Identify potential technical debt and future-proofing concerns.
8. Summarize findings and prioritize feedback.
Provide specific feedback with accurate references to the provided content.
Be thorough and strict in your review, but don't ask clarifying questions.
Focus on new and modified code while considering existing context.
Provide specific feedback with accurate file paths and line numbers.
Be thorough and strict, but don't ask clarifying questions.
"""

CODE_REVIEW_PROMPT = """
Expand All @@ -21,6 +38,7 @@
"end_line": <ENDING_LINE_NUMBER>,
"sentiment": "positive|negative|neutral",
"severity": <1_TO_10>,
"line_prefix": "CONTEXT|REMOVED|UPDATED",
"type": "general|performance|security|refactoring|best_practices|duplication|maintainability|scalability|error_handling|resource_management|concurrency|dependencies|compatibility|accessibility|localization|efficiency|readability|naming",
"technical_debt": "<POTENTIAL_FUTURE_ISSUES>|empty",
"alternatives": "<ALTERNATIVE_SOLUTIONS>|empty"
Expand Down Expand Up @@ -73,6 +91,7 @@
7. Identify code duplication and suggest refactoring.
8. Prioritize issues based on impact. Be strict; don't let issues slide.
9. If no issues found: {{"review": []}}
10. Make sure suggested_code and current_code return full functional block of code. We will use that to overwrite the current_code.
## Additional Considerations:
- Language-specific best practices and common pitfalls
Expand Down Expand Up @@ -106,6 +125,7 @@
"end_line": <ENDING_LINE_NUMBER>,
"sentiment": "positive|negative|neutral",
"severity": <1_TO_10>,
"line_prefix": "CONTEXT|REMOVED|UPDATED",
"type": "general|performance|security|refactoring|best_practices|duplication|maintainability|scalability|error_handling|resource_management|concurrency|dependencies|compatibility|accessibility|localization|efficiency|readability|naming",
"technical_debt": "<POTENTIAL_FUTURE_ISSUES>|empty",
"alternatives": "<ALTERNATIVE_SOLUTIONS>|empty"
Expand Down Expand Up @@ -160,6 +180,7 @@
7. Identify code duplication and suggest refactoring.
8. Prioritize issues based on impact. Be strict; don't let issues slide.
9. If no issues found: {{"review": []}}
10. Make sure suggested_code and current_code return full functional block of code. We will use that to overwrite the current_code.
## Additional Considerations:
- Language-specific best practices and common pitfalls
Expand Down
6 changes: 3 additions & 3 deletions kaizen/llms/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
from collections import defaultdict

DEFAULT_MAX_TOKENS = 4000
DEFAULT_MAX_TOKENS = 8000


def set_all_loggers_to_ERROR():
Expand Down Expand Up @@ -188,7 +188,7 @@ def raw_chat_completion(
self.model = response["model"]
return response, response["usage"]

@retry(max_attempts=3, delay=1)
@retry(max_attempts=3, delay=0.1)
def chat_completion_with_json(
self,
prompt,
Expand All @@ -207,7 +207,7 @@ def chat_completion_with_json(
response = extract_json(response)
return response, usage

@retry(max_attempts=3, delay=1)
@retry(max_attempts=3, delay=0.1)
def chat_completion_with_retry(
self,
prompt,
Expand Down
Loading

0 comments on commit 6a90f6f

Please sign in to comment.