From 2570d1642ae79f693931c2dd6428d1ee3d3072e8 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 4 Dec 2024 15:40:21 -0800 Subject: [PATCH 1/9] added evaluator and formatter and main --- recipes/use_cases/prompt-migration/main.py | 61 +++++++++++++ .../prompt_migration/evaluator.py | 90 +++++++++++++++++++ .../prompt_migration/formatters.py | 17 ++++ 3 files changed, 168 insertions(+) create mode 100644 recipes/use_cases/prompt-migration/main.py create mode 100644 recipes/use_cases/prompt-migration/prompt_migration/evaluator.py create mode 100644 recipes/use_cases/prompt-migration/prompt_migration/formatters.py diff --git a/recipes/use_cases/prompt-migration/main.py b/recipes/use_cases/prompt-migration/main.py new file mode 100644 index 000000000..40991468b --- /dev/null +++ b/recipes/use_cases/prompt-migration/main.py @@ -0,0 +1,61 @@ +import dspy +from prompt_migration.engine import PromptMigrationEngine, PromptTemplate +from prompt_migration.evaluator import PromptEvaluator +from prompt_migration.eval_dataset import get_evaluation_dataset, get_eval_subset + +import os +import dotenv + +dotenv.load_dotenv() + +def main(): + openai_lm = dspy.LM( + model="gpt-3.5-turbo", + api_key=os.getenv("OPENAI_API_KEY") + ) + + # target_lm = dspy.LM( + # model="together_ai/togethercomputer/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + # api_key=os.getenv("TOGETHER_API_KEY") + # ) + # target_lm = dspy.LM('ollama_chat/llama3.2:3b-instruct-fp16', api_base='http://localhost:11434', api_key='') + target_lm = dspy.HFModel(model="gpt2") + + engine = PromptMigrationEngine(openai_lm, target_lm) + + source_prompt = PromptTemplate( + template="Write a Python function that takes as input a file path to an image, loads the image into memory as a numpy array, then crops the rows and columns around the perimeter if they are darker than a threshold value. Use the mean value of rows and columns to decide if they should be marked for deletion.", + input_variables=["text"], + model_type="openai" + ) + + eval_dataset = get_evaluation_dataset() + + + # To evaluate on a specific subset, use the following: + #summarization_dataset = get_eval_subset(prompt_type="summarization") + #simple_tasks = get_eval_subset(complexity="simple") + + # Migrate prompt + print("Migrating prompt...") + migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset) + + # Evaluate migration + print("Evaluating migration...") + evaluator = PromptEvaluator(openai_lm, target_lm) + metrics = evaluator.evaluate( + source_prompt.template, + migrated_prompt.template, + eval_dataset + ) + + print(f"\nResults:") + print(f"Original prompt: {source_prompt.template}") + print(f"Migrated prompt: {migrated_prompt.template}") + print(f"Evaluation metrics:") + print(f" Accuracy: {metrics.accuracy:.2f}") + print(f" Similarity: {metrics.similarity:.2f}") + print(f" Consistency: {metrics.consistency:.2f}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py new file mode 100644 index 000000000..33bb09f07 --- /dev/null +++ b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py @@ -0,0 +1,90 @@ +import dspy +from typing import List, Dict +from dataclasses import dataclass + +@dataclass +class EvaluationMetrics: + accuracy: float + similarity: float + consistency: float + +class PromptEvaluator: + def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM): + self.source_lm = source_lm + self.target_lm = target_lm + + def _create_judge(self): + """Create an LLM judge to evaluate prompt outputs.""" + class FactJudge(dspy.Signature): + """Judge if the migrated prompt produces equivalent outputs.""" + source_output = dspy.InputField(desc="Output from source model") + target_output = dspy.InputField(desc="Output from target model") + factually_correct = dspy.OutputField( + desc="Is the target output equivalent to the source output in terms of content and intent?", + prefix="Factual[Yes/No]:" + ) + reasoning = dspy.OutputField(desc="Explanation for the judgment") + + return dspy.ChainOfThought(FactJudge) + + def _get_model_output(self, model, text: str) -> str: + """Helper function to get output from different model types.""" + try: + # Try different methods since DSPy model interfaces can vary + if hasattr(model, '__call__'): + return model(text) + elif hasattr(model, 'generate'): + return model.generate(text) + elif hasattr(model, 'complete'): + return model.complete(text) + else: + raise AttributeError(f"Model {type(model)} has no supported generation method") + except Exception as e: + print(f"Error generating output with {type(model)}: {str(e)}") + return "" + + def _calculate_metrics(self, evaluator, test_cases): + """Calculate evaluation metrics using LLM as judge.""" + total_similarity = 0.0 + total_accuracy = 0.0 + total_consistency = 0.0 + + judge = self._create_judge() + + for case in test_cases: + source_output = self._get_model_output(self.source_lm, case["text"]) + target_output = self._get_model_output(self.target_lm, case["text"]) + + judgment = judge( + source_output=source_output, + target_output=target_output + ) + + is_equivalent = judgment.factually_correct.lower() == "yes" + + similarity = float(is_equivalent) + accuracy = float(target_output.lower() == case["expected_summary"].lower()) + consistency = float(is_equivalent) + + total_similarity += similarity + total_accuracy += accuracy + total_consistency += consistency + + print(f"\nJudge's reasoning: {judgment.reasoning}") + + n = len(test_cases) + return EvaluationMetrics( + accuracy=total_accuracy / n, + similarity=total_similarity / n, + consistency=total_consistency / n + ) + + def evaluate(self, + source_prompt: str, + target_prompt: str, + test_cases: List[Dict]) -> EvaluationMetrics: + """Evaluates the quality of prompt migration using LLM as judge.""" + + metrics = self._calculate_metrics(None, test_cases) # evaluator param not needed anymore + + return metrics \ No newline at end of file diff --git a/recipes/use_cases/prompt-migration/prompt_migration/formatters.py b/recipes/use_cases/prompt-migration/prompt_migration/formatters.py new file mode 100644 index 000000000..42c0043bd --- /dev/null +++ b/recipes/use_cases/prompt-migration/prompt_migration/formatters.py @@ -0,0 +1,17 @@ +from typing import List + +class PromptFormatter: + @staticmethod + def openai_to_llama(prompt: str) -> str: + """Convert OpenAI-style prompts to Llama format.""" + # Basic conversion logic + converted = prompt.replace("{{", "{").replace("}}", "}") + return converted + + @staticmethod + def extract_variables(prompt: str) -> List[str]: + """Extract variable names from a prompt template.""" + import re + pattern = r"\{([^}]+)\}" + matches = re.findall(pattern, prompt) + return list(set(matches)) \ No newline at end of file From 08e41d0d0a10ec06437b88e35420a9f1cd84ccd2 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 4 Dec 2024 15:40:36 -0800 Subject: [PATCH 2/9] add usage guide and init --- .../prompt-migration/examples/usage.py | 37 +++++++++++++++++++ .../prompt_migration/__init__.py | 0 2 files changed, 37 insertions(+) create mode 100644 recipes/use_cases/prompt-migration/examples/usage.py create mode 100644 recipes/use_cases/prompt-migration/prompt_migration/__init__.py diff --git a/recipes/use_cases/prompt-migration/examples/usage.py b/recipes/use_cases/prompt-migration/examples/usage.py new file mode 100644 index 000000000..883fa9737 --- /dev/null +++ b/recipes/use_cases/prompt-migration/examples/usage.py @@ -0,0 +1,37 @@ +import dspy +from prompt_migration.engine import PromptMigrationEngine, PromptTemplate +from prompt_migration.evaluator import PromptEvaluator + +# Initialize LMs +openai_lm = dspy.OpenAI(model="gpt-3.5-turbo") +target_lm = dspy.HFModel(model="gpt2") + +# Create migration engine +engine = PromptMigrationEngine(openai_lm, target_lm) + +# Define source prompt +source_prompt = PromptTemplate( + template="Summarize the following text: {text}", + input_variables=["text"], + model_type="openai" +) + +# Example evaluation dataset +eval_dataset = [ + {"text": "Example text 1", "expected_summary": "Summary 1"}, + {"text": "Example text 2", "expected_summary": "Summary 2"}, +] + +# Migrate prompt +migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset) + +# Evaluate migration +evaluator = PromptEvaluator(openai_lm, target_lm) +metrics = evaluator.evaluate( + source_prompt.template, + migrated_prompt.template, + eval_dataset +) + +print(f"Migrated prompt: {migrated_prompt.template}") +print(f"Evaluation metrics: {metrics}") \ No newline at end of file diff --git a/recipes/use_cases/prompt-migration/prompt_migration/__init__.py b/recipes/use_cases/prompt-migration/prompt_migration/__init__.py new file mode 100644 index 000000000..e69de29bb From a3e96e4e46d2368e07059a0dbd07bab5a935a453 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 4 Dec 2024 15:40:55 -0800 Subject: [PATCH 3/9] add engine and eval dataset --- .../prompt_migration/engine.py | 68 ++++++++++ .../prompt_migration/eval_dataset.py | 123 ++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 recipes/use_cases/prompt-migration/prompt_migration/engine.py create mode 100644 recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py diff --git a/recipes/use_cases/prompt-migration/prompt_migration/engine.py b/recipes/use_cases/prompt-migration/prompt_migration/engine.py new file mode 100644 index 000000000..a9093cfae --- /dev/null +++ b/recipes/use_cases/prompt-migration/prompt_migration/engine.py @@ -0,0 +1,68 @@ +import dspy +from typing import List, Dict, Optional +from dataclasses import dataclass + +@dataclass +class PromptTemplate: + template: str + input_variables: List[str] + model_type: str # 'openai' or 'llama' + +class PromptMigrationEngine: + def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM): + self.source_lm = source_lm + self.target_lm = target_lm + dspy.configure(lm=source_lm) + + def _optimize_transformation(self, transformer, eval_dataset): + """Optimize the transformation using the evaluation dataset.""" + class AccuracyMetric: + def __call__(self, example, prediction, trace=None): + return float(prediction.target == example.expected_output) + + optimizer = dspy.BootstrapFewShotWithRandomSearch( + metric=AccuracyMetric(), + max_bootstrapped_demos=4, + max_labeled_demos=4, + num_threads=4 + ) + + train_data = [ + dspy.Example( + source=item["text"], + expected_output=item["expected_summary"] + ).with_inputs("source") for item in eval_dataset + ] + + return optimizer.compile(transformer, trainset=train_data) + + def migrate_prompt(self, + source_prompt: PromptTemplate, + eval_dataset: Optional[List[Dict]] = None) -> PromptTemplate: + """Migrates a prompt from source LM to target LM format.""" + + class PromptTransformation(dspy.Signature): + """Convert a prompt from one format to another.""" + source = dspy.InputField(desc="Source prompt template") + target = dspy.OutputField(desc="Transformed prompt template") + + class Transformer(dspy.Module): + def __init__(self): + super().__init__() + self.chain = dspy.ChainOfThought(PromptTransformation) + + def forward(self, source): + return self.chain(source=source) + + transformer = Transformer() + + if eval_dataset: + transformer = self._optimize_transformation(transformer, eval_dataset) + + result = transformer(source=source_prompt.template) + + return PromptTemplate( + template=result.target, + input_variables=source_prompt.input_variables, + model_type='llama' + ) \ No newline at end of file diff --git a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py new file mode 100644 index 000000000..7963aad25 --- /dev/null +++ b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py @@ -0,0 +1,123 @@ +from typing import List, Dict + +def get_evaluation_dataset() -> List[Dict]: + """ + Returns a comprehensive evaluation dataset for testing prompt migrations. + Each test case includes: + - text: Input text + - expected_summary: Expected output + - prompt_type: Type of prompt (summarization, classification, qa, etc.) + - complexity: Difficulty level (simple, medium, complex) + """ + return [ + # Summarization examples + { + "text": "The quick brown fox jumps over the lazy dog.", + "expected_summary": "A fox jumps over a dog.", + "prompt_type": "summarization", + "complexity": "simple" + }, + { + "text": """Machine learning is a subset of artificial intelligence that focuses on developing + systems that can learn from and make decisions based on data. It has numerous + applications in various fields including healthcare, finance, and autonomous vehicles.""", + "expected_summary": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.", + "prompt_type": "summarization", + "complexity": "medium" + }, + + # Classification examples + { + "text": "I absolutely loved this product! Best purchase ever!", + "expected_summary": "Positive", + "prompt_type": "sentiment_classification", + "complexity": "simple" + }, + { + "text": "The product works fine but the customer service could be better.", + "expected_summary": "Neutral", + "prompt_type": "sentiment_classification", + "complexity": "medium" + }, + + # Question-Answering examples + { + "text": "What is the capital of France? Context: Paris is the capital and largest city of France.", + "expected_summary": "Paris", + "prompt_type": "qa", + "complexity": "simple" + }, + { + "text": """What causes rain? Context: Rain is precipitation of liquid water in the form of droplets. + Water vapor in warm air rises and cools, forming clouds. When the droplets become too + heavy, they fall as rain.""", + "expected_summary": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.", + "prompt_type": "qa", + "complexity": "medium" + }, + + # Code-related examples + { + "text": "Write a function to add two numbers in Python.", + "expected_summary": "def add(a, b):\n return a + b", + "prompt_type": "code_generation", + "complexity": "simple" + }, + { + "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2", + "expected_summary": "This code multiplies each element in the array 'arr' by 2.", + "prompt_type": "code_explanation", + "complexity": "simple" + }, + + # Text transformation examples + { + "text": "convert this to passive voice: The cat chased the mouse.", + "expected_summary": "The mouse was chased by the cat.", + "prompt_type": "text_transformation", + "complexity": "simple" + }, + { + "text": "translate to French: Hello, how are you?", + "expected_summary": "Bonjour, comment allez-vous?", + "prompt_type": "translation", + "complexity": "simple" + }, + + # Complex reasoning examples + { + "text": """A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves + Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations + are 375 miles apart, at what time will the trains meet?""", + "expected_summary": "The trains will meet at 5:00 PM.", + "prompt_type": "problem_solving", + "complexity": "complex" + }, + { + "text": """Analyze the environmental impact of electric vehicles versus traditional + gasoline vehicles, considering manufacturing, operation, and disposal.""", + "expected_summary": """Electric vehicles typically have higher manufacturing emissions but lower + operational emissions compared to gasoline vehicles. Overall lifecycle + environmental impact depends on electricity source and battery recycling.""", + "prompt_type": "analysis", + "complexity": "complex" + } + ] + +def get_eval_subset(prompt_type: str = None, complexity: str = None) -> List[Dict]: + """ + Returns a filtered subset of the evaluation dataset based on prompt type and/or complexity. + + Args: + prompt_type: Type of prompts to filter (e.g., 'summarization', 'qa', etc.) + complexity: Complexity level to filter (e.g., 'simple', 'medium', 'complex') + """ + dataset = get_evaluation_dataset() + + if prompt_type: + dataset = [d for d in dataset if d["prompt_type"] == prompt_type] + + if complexity: + dataset = [d for d in dataset if d["complexity"] == complexity] + + return dataset \ No newline at end of file From 096249bf33b12c2eb4cc7ad135d1b566e696b5ab Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 4 Dec 2024 15:41:27 -0800 Subject: [PATCH 4/9] add .env settings and configure yml --- .gitignore | 2 ++ recipes/use_cases/prompt-migration/.env.template | 5 +++++ recipes/use_cases/prompt-migration/environment.yml | 13 +++++++++++++ 3 files changed, 20 insertions(+) create mode 100644 recipes/use_cases/prompt-migration/.env.template create mode 100644 recipes/use_cases/prompt-migration/environment.yml diff --git a/.gitignore b/.gitignore index 3ee7b311c..5fddf5044 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__ .ipynb_checkpoints wandb/ artifacts/ + +**/.env \ No newline at end of file diff --git a/recipes/use_cases/prompt-migration/.env.template b/recipes/use_cases/prompt-migration/.env.template new file mode 100644 index 000000000..4913ba9e4 --- /dev/null +++ b/recipes/use_cases/prompt-migration/.env.template @@ -0,0 +1,5 @@ +OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE +REPLICATE_API_TOKEN=YOUR_REPLICATE_TOKEN_HERE +DATABRICKS_API_TOKEN=your_databricks_token_here +DATABRICKS_API_URL=your_databricks_endpoint_url +TOGETHER_API_KEY=your_together_ai_key_here diff --git a/recipes/use_cases/prompt-migration/environment.yml b/recipes/use_cases/prompt-migration/environment.yml new file mode 100644 index 000000000..212d492df --- /dev/null +++ b/recipes/use_cases/prompt-migration/environment.yml @@ -0,0 +1,13 @@ +name: prompt-migration +channels: + - defaults + - pytorch +dependencies: + - python=3.9 + - pip + - pip: + - dspy-ai + - torch + - transformers + - openai + - databricks-sdk \ No newline at end of file From 263b8b569d1be0ded6c757d0bdfee3b2f46c5b5f Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 4 Dec 2024 15:49:59 -0800 Subject: [PATCH 5/9] placeholder readme --- recipes/use_cases/prompt-migration/readme.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 recipes/use_cases/prompt-migration/readme.md diff --git a/recipes/use_cases/prompt-migration/readme.md b/recipes/use_cases/prompt-migration/readme.md new file mode 100644 index 000000000..503fa1da0 --- /dev/null +++ b/recipes/use_cases/prompt-migration/readme.md @@ -0,0 +1 @@ +#TODO \ No newline at end of file From 43a2cbc22037a37db5a782657c364b92e66c5d76 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 4 Dec 2024 17:23:19 -0800 Subject: [PATCH 6/9] adding eval dataset --- .../prompt_migration/eval_dataset.py | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py index 7963aad25..8a6c0bfae 100644 --- a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py +++ b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py @@ -101,6 +101,163 @@ def get_evaluation_dataset() -> List[Dict]: environmental impact depends on electricity source and battery recycling.""", "prompt_type": "analysis", "complexity": "complex" + }, + + # Code Generation + { + "text": "Write a Python function to check if a number is prime.", + "expected_summary": """def is_prime(n): + if n < 2: + return False + for i in range(2, int(n ** 0.5) + 1): + if n % i == 0: + return False + return True""", + "prompt_type": "code_generation", + "complexity": "medium" + }, + { + "text": "Create a Python function to reverse a string.", + "expected_summary": """def reverse_string(s): + return s[::-1]""", + "prompt_type": "code_generation", + "complexity": "simple" + }, + + { + "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]", + "expected_summary": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.", + "prompt_type": "code_explanation", + "complexity": "medium" + }, + + { + "text": "Write a Python function to implement binary search.", + "expected_summary": """def binary_search(arr, target): + left, right = 0, len(arr) - 1 + + while left <= right: + mid = (left + right) // 2 + if arr[mid] == target: + return mid + elif arr[mid] < target: + left = mid + 1 + else: + right = mid - 1 + + return -1""", + "prompt_type": "code_generation", + "complexity": "medium" + }, + + { + "text": "Implement a Stack class in Python using a list.", + "expected_summary": """class Stack: + def __init__(self): + self.items = [] + + def push(self, item): + self.items.append(item) + + def pop(self): + if not self.is_empty(): + return self.items.pop() + + def is_empty(self): + return len(self.items) == 0 + + def peek(self): + if not self.is_empty(): + return self.items[-1]""", + "prompt_type": "code_generation", + "complexity": "medium" + }, + + { + "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)", + "expected_summary": """def factorial(n): + if n == 0 or n == 1: + return 1 + return n * factorial(n-1)""", + "prompt_type": "code_debugging", + "complexity": "medium" + }, + + { + "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n", + "expected_summary": """def fibonacci(n): + if n <= 1: + return n + a, b = 0, 1 + for _ in range(2, n + 1): + a, b = b, a + b + return b""", + "prompt_type": "code_optimization", + "complexity": "medium" + }, + + { + "text": "Write a Python function using requests to fetch data from a REST API endpoint.", + "expected_summary": """import requests + +def fetch_data(url, params=None): + try: + response = requests.get(url, params=params) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + print(f"Error fetching data: {e}") + return None""", + "prompt_type": "code_generation", + "complexity": "medium" + }, + + { + "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.", + "expected_summary": """import csv + +def read_csv(file_path): + data = [] + try: + with open(file_path, 'r') as file: + reader = csv.DictReader(file) + for row in reader: + data.append(row) + return data + except Exception as e: + print(f"Error reading CSV: {e}") + return None""", + "prompt_type": "code_generation", + "complexity": "medium" + }, + + { + "text": "Write a Python function that safely converts a string to integer with error handling.", + "expected_summary": """def safe_int_convert(s): + try: + return int(s), None + except ValueError as e: + return None, str(e)""", + "prompt_type": "code_generation", + "complexity": "simple" + }, + + # Complex Algorithm + { + "text": "Implement a Python function for Depth-First Search on a graph.", + "expected_summary": """def dfs(graph, start, visited=None): + if visited is None: + visited = set() + + visited.add(start) + + for next_node in graph[start]: + if next_node not in visited: + dfs(graph, next_node, visited) + + return visited""", + "prompt_type": "code_generation", + "complexity": "complex" } ] From b85811d0b9891413485b179731ba67cf42028bb5 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Thu, 5 Dec 2024 11:22:10 -0800 Subject: [PATCH 7/9] change eval dataset, include more robust judging, improved main --- .../prompt-migration/examples/usage.py | 5 +- recipes/use_cases/prompt-migration/main.py | 62 +++++- .../prompt_migration/engine.py | 66 ++++-- .../prompt_migration/eval_dataset.py | 58 ++--- .../prompt_migration/evaluator.py | 198 ++++++++++++++---- 5 files changed, 291 insertions(+), 98 deletions(-) diff --git a/recipes/use_cases/prompt-migration/examples/usage.py b/recipes/use_cases/prompt-migration/examples/usage.py index 883fa9737..2c88008a5 100644 --- a/recipes/use_cases/prompt-migration/examples/usage.py +++ b/recipes/use_cases/prompt-migration/examples/usage.py @@ -16,10 +16,9 @@ model_type="openai" ) -# Example evaluation dataset eval_dataset = [ - {"text": "Example text 1", "expected_summary": "Summary 1"}, - {"text": "Example text 2", "expected_summary": "Summary 2"}, + {"text": "Example text 1", "expected_answer": "Summary 1"}, + {"text": "Example text 2", "expected_answer": "Summary 2"}, ] # Migrate prompt diff --git a/recipes/use_cases/prompt-migration/main.py b/recipes/use_cases/prompt-migration/main.py index 40991468b..83eb1fc46 100644 --- a/recipes/use_cases/prompt-migration/main.py +++ b/recipes/use_cases/prompt-migration/main.py @@ -14,17 +14,48 @@ def main(): api_key=os.getenv("OPENAI_API_KEY") ) - # target_lm = dspy.LM( - # model="together_ai/togethercomputer/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", - # api_key=os.getenv("TOGETHER_API_KEY") - # ) + target_lm = dspy.LM( + model="together_ai/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + api_key=os.getenv("TOGETHER_API_KEY") + ) + # To run it with ollama # target_lm = dspy.LM('ollama_chat/llama3.2:3b-instruct-fp16', api_base='http://localhost:11434', api_key='') - target_lm = dspy.HFModel(model="gpt2") + + # To run it with huggingface + # target_lm = dspy.HFModel(model="gpt2") engine = PromptMigrationEngine(openai_lm, target_lm) source_prompt = PromptTemplate( - template="Write a Python function that takes as input a file path to an image, loads the image into memory as a numpy array, then crops the rows and columns around the perimeter if they are darker than a threshold value. Use the mean value of rows and columns to decide if they should be marked for deletion.", + template="""You are an advanced Large Language Model tasked with generating Python code snippets in response to user prompts. Your primary objective is to provide accurate, concise, and well-structured Python functions. Follow these guidelines: + + Understand the Context: Analyze the input prompt and identify its category (e.g., API Usage, File Handling, Error Handling). + + Generate Code: + Write Python code that directly addresses the user's request. + Ensure the code is syntactically correct, functional, and adheres to Python best practices. + Include necessary imports and handle potential edge cases. + + Error Handling: + Include appropriate error handling where applicable (e.g., try-except blocks). + If exceptions occur, provide meaningful error messages. + + Readability: + Use clear variable names and include comments where necessary for clarity. + Prioritize readability and maintainability in all generated code. + + Complexity Alignment: + Tailor the code's complexity based on the indicated difficulty (e.g., simple, medium, complex). + Ensure that the solution is neither overly simplistic nor unnecessarily complicated. + + Prompt Type: + Focus on the code_generation type for creating Python functions. + Avoid deviating from the task unless additional clarification is requested. + + Testing and Validity: + Assume the function might be run immediately. Provide code that is ready for use or minimal adaptation. + Highlight any dependencies or external libraries required. + """, input_variables=["text"], model_type="openai" ) @@ -33,20 +64,31 @@ def main(): # To evaluate on a specific subset, use the following: - #summarization_dataset = get_eval_subset(prompt_type="summarization") + code_generation_dataset = get_eval_subset(prompt_type="code_generation") #simple_tasks = get_eval_subset(complexity="simple") + evaluator = PromptEvaluator(openai_lm, target_lm) + + metrics = evaluator.evaluate( + source_prompt.template, # Same prompt for both + source_prompt.template, # Same prompt for both + code_generation_dataset + ) + + print(f"Evaluation metrics:") + print(f" Accuracy: {metrics.accuracy:.2f}") + print(f" Similarity: {metrics.similarity:.2f}") + print(f" Consistency: {metrics.consistency:.2f}") # Migrate prompt print("Migrating prompt...") - migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset) + migrated_prompt = engine.migrate_prompt(source_prompt, code_generation_dataset) # Evaluate migration print("Evaluating migration...") - evaluator = PromptEvaluator(openai_lm, target_lm) metrics = evaluator.evaluate( source_prompt.template, migrated_prompt.template, - eval_dataset + code_generation_dataset ) print(f"\nResults:") diff --git a/recipes/use_cases/prompt-migration/prompt_migration/engine.py b/recipes/use_cases/prompt-migration/prompt_migration/engine.py index a9093cfae..319abc781 100644 --- a/recipes/use_cases/prompt-migration/prompt_migration/engine.py +++ b/recipes/use_cases/prompt-migration/prompt_migration/engine.py @@ -9,30 +9,53 @@ class PromptTemplate: model_type: str # 'openai' or 'llama' class PromptMigrationEngine: - def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM): + def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM): self.source_lm = source_lm self.target_lm = target_lm dspy.configure(lm=source_lm) def _optimize_transformation(self, transformer, eval_dataset): """Optimize the transformation using the evaluation dataset.""" - class AccuracyMetric: + class PromptQualityMetric: + def __init__(self, source_lm, target_lm): + self.source_lm = source_lm + self.target_lm = target_lm + def __call__(self, example, prediction, trace=None): - return float(prediction.target == example.expected_output) + if not hasattr(prediction, 'target'): + return 0.0 + + try: + # Get outputs from both models using the prompts + source_output = self.source_lm(example.source) + target_output = self.target_lm(prediction.target) + + # Compare outputs (basic similarity) + from difflib import SequenceMatcher + similarity = SequenceMatcher(None, + str(source_output), + str(target_output)).ratio() + return similarity + except Exception as e: + print(f"Error in metric: {e}") + return 0.0 optimizer = dspy.BootstrapFewShotWithRandomSearch( - metric=AccuracyMetric(), - max_bootstrapped_demos=4, - max_labeled_demos=4, - num_threads=4 + metric=PromptQualityMetric(self.source_lm, self.target_lm), + max_bootstrapped_demos=2, + max_labeled_demos=2, + num_threads=1 ) - train_data = [ - dspy.Example( + # Prepare training data + train_data = [] + for item in eval_dataset: + # Create example with both prompt and expected output + example = dspy.Example( source=item["text"], - expected_output=item["expected_summary"] - ).with_inputs("source") for item in eval_dataset - ] + expected_output=item["expected_answer"] + ).with_inputs("source") + train_data.append(example) return optimizer.compile(transformer, trainset=train_data) @@ -44,7 +67,7 @@ def migrate_prompt(self, class PromptTransformation(dspy.Signature): """Convert a prompt from one format to another.""" source = dspy.InputField(desc="Source prompt template") - target = dspy.OutputField(desc="Transformed prompt template") + target = dspy.OutputField(desc="Transformed prompt template that maintains functionality while adapting to target model format") class Transformer(dspy.Module): def __init__(self): @@ -52,7 +75,18 @@ def __init__(self): self.chain = dspy.ChainOfThought(PromptTransformation) def forward(self, source): - return self.chain(source=source) + # Add context about the transformation task + prompt = f""" + Transform this prompt while: + 1. Maintaining core functionality + 2. Adapting to target model format + 3. Preserving input variables + 4. Keeping essential instructions + + Source prompt: + {source} + """ + return self.chain(source=prompt) transformer = Transformer() @@ -61,6 +95,10 @@ def forward(self, source): result = transformer(source=source_prompt.template) + # Format for target model + if source_prompt.model_type == "openai" and "llama" in str(self.target_lm): + result.target = f"### Instruction:\n{result.target}\n\n### Response:" + return PromptTemplate( template=result.target, input_variables=source_prompt.input_variables, diff --git a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py index 8a6c0bfae..c7fae9f14 100644 --- a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py +++ b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py @@ -5,7 +5,7 @@ def get_evaluation_dataset() -> List[Dict]: Returns a comprehensive evaluation dataset for testing prompt migrations. Each test case includes: - text: Input text - - expected_summary: Expected output + - expected_answer: Expected output - prompt_type: Type of prompt (summarization, classification, qa, etc.) - complexity: Difficulty level (simple, medium, complex) """ @@ -13,7 +13,7 @@ def get_evaluation_dataset() -> List[Dict]: # Summarization examples { "text": "The quick brown fox jumps over the lazy dog.", - "expected_summary": "A fox jumps over a dog.", + "expected_answer": "A fox jumps over a dog.", "prompt_type": "summarization", "complexity": "simple" }, @@ -21,7 +21,7 @@ def get_evaluation_dataset() -> List[Dict]: "text": """Machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from and make decisions based on data. It has numerous applications in various fields including healthcare, finance, and autonomous vehicles.""", - "expected_summary": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.", + "expected_answer": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.", "prompt_type": "summarization", "complexity": "medium" }, @@ -29,13 +29,13 @@ def get_evaluation_dataset() -> List[Dict]: # Classification examples { "text": "I absolutely loved this product! Best purchase ever!", - "expected_summary": "Positive", + "expected_answer": "Positive", "prompt_type": "sentiment_classification", "complexity": "simple" }, { "text": "The product works fine but the customer service could be better.", - "expected_summary": "Neutral", + "expected_answer": "Neutral", "prompt_type": "sentiment_classification", "complexity": "medium" }, @@ -43,7 +43,7 @@ def get_evaluation_dataset() -> List[Dict]: # Question-Answering examples { "text": "What is the capital of France? Context: Paris is the capital and largest city of France.", - "expected_summary": "Paris", + "expected_answer": "Paris", "prompt_type": "qa", "complexity": "simple" }, @@ -51,7 +51,7 @@ def get_evaluation_dataset() -> List[Dict]: "text": """What causes rain? Context: Rain is precipitation of liquid water in the form of droplets. Water vapor in warm air rises and cools, forming clouds. When the droplets become too heavy, they fall as rain.""", - "expected_summary": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.", + "expected_answer": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.", "prompt_type": "qa", "complexity": "medium" }, @@ -59,13 +59,13 @@ def get_evaluation_dataset() -> List[Dict]: # Code-related examples { "text": "Write a function to add two numbers in Python.", - "expected_summary": "def add(a, b):\n return a + b", + "expected_answer": "def add(a, b):\n return a + b", "prompt_type": "code_generation", "complexity": "simple" }, { "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2", - "expected_summary": "This code multiplies each element in the array 'arr' by 2.", + "expected_answer": "This code multiplies each element in the array 'arr' by 2.", "prompt_type": "code_explanation", "complexity": "simple" }, @@ -73,13 +73,13 @@ def get_evaluation_dataset() -> List[Dict]: # Text transformation examples { "text": "convert this to passive voice: The cat chased the mouse.", - "expected_summary": "The mouse was chased by the cat.", + "expected_answer": "The mouse was chased by the cat.", "prompt_type": "text_transformation", "complexity": "simple" }, { "text": "translate to French: Hello, how are you?", - "expected_summary": "Bonjour, comment allez-vous?", + "expected_answer": "Bonjour, comment allez-vous?", "prompt_type": "translation", "complexity": "simple" }, @@ -89,24 +89,24 @@ def get_evaluation_dataset() -> List[Dict]: "text": """A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations are 375 miles apart, at what time will the trains meet?""", - "expected_summary": "The trains will meet at 5:00 PM.", + "expected_answer": "The trains will meet at 5:00 PM.", "prompt_type": "problem_solving", "complexity": "complex" }, { "text": """Analyze the environmental impact of electric vehicles versus traditional gasoline vehicles, considering manufacturing, operation, and disposal.""", - "expected_summary": """Electric vehicles typically have higher manufacturing emissions but lower + "expected_answer": """Electric vehicles typically have higher manufacturing emissions but lower operational emissions compared to gasoline vehicles. Overall lifecycle environmental impact depends on electricity source and battery recycling.""", "prompt_type": "analysis", "complexity": "complex" }, - # Code Generation + # Simple Code Generation { "text": "Write a Python function to check if a number is prime.", - "expected_summary": """def is_prime(n): + "expected_answer": """def is_prime(n): if n < 2: return False for i in range(2, int(n ** 0.5) + 1): @@ -118,22 +118,24 @@ def get_evaluation_dataset() -> List[Dict]: }, { "text": "Create a Python function to reverse a string.", - "expected_summary": """def reverse_string(s): + "expected_answer": """def reverse_string(s): return s[::-1]""", "prompt_type": "code_generation", "complexity": "simple" }, + # Code Explanation { "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]", - "expected_summary": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.", + "expected_answer": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.", "prompt_type": "code_explanation", "complexity": "medium" }, + # Algorithm Implementation { "text": "Write a Python function to implement binary search.", - "expected_summary": """def binary_search(arr, target): + "expected_answer": """def binary_search(arr, target): left, right = 0, len(arr) - 1 while left <= right: @@ -150,9 +152,10 @@ def get_evaluation_dataset() -> List[Dict]: "complexity": "medium" }, + # Data Structure Implementation { "text": "Implement a Stack class in Python using a list.", - "expected_summary": """class Stack: + "expected_answer": """class Stack: def __init__(self): self.items = [] @@ -173,9 +176,10 @@ def peek(self): "complexity": "medium" }, + # Code Debugging { "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)", - "expected_summary": """def factorial(n): + "expected_answer": """def factorial(n): if n == 0 or n == 1: return 1 return n * factorial(n-1)""", @@ -183,9 +187,10 @@ def peek(self): "complexity": "medium" }, + # Code Optimization { "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n", - "expected_summary": """def fibonacci(n): + "expected_answer": """def fibonacci(n): if n <= 1: return n a, b = 0, 1 @@ -196,9 +201,10 @@ def peek(self): "complexity": "medium" }, + # API Usage { "text": "Write a Python function using requests to fetch data from a REST API endpoint.", - "expected_summary": """import requests + "expected_answer": """import requests def fetch_data(url, params=None): try: @@ -212,9 +218,10 @@ def fetch_data(url, params=None): "complexity": "medium" }, + # File Handling { "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.", - "expected_summary": """import csv + "expected_answer": """import csv def read_csv(file_path): data = [] @@ -231,9 +238,10 @@ def read_csv(file_path): "complexity": "medium" }, + # Error Handling { "text": "Write a Python function that safely converts a string to integer with error handling.", - "expected_summary": """def safe_int_convert(s): + "expected_answer": """def safe_int_convert(s): try: return int(s), None except ValueError as e: @@ -245,7 +253,7 @@ def read_csv(file_path): # Complex Algorithm { "text": "Implement a Python function for Depth-First Search on a graph.", - "expected_summary": """def dfs(graph, start, visited=None): + "expected_answer": """def dfs(graph, start, visited=None): if visited is None: visited = set() diff --git a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py index 33bb09f07..2607e68ca 100644 --- a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py +++ b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py @@ -1,90 +1,196 @@ -import dspy +import json from typing import List, Dict from dataclasses import dataclass +import dspy +import os +from datetime import datetime @dataclass class EvaluationMetrics: accuracy: float similarity: float consistency: float + individual_scores: List[Dict] # Store individual test case scores class PromptEvaluator: - def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM): + def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM): self.source_lm = source_lm self.target_lm = target_lm + dspy.configure(lm=source_lm) # Configure DSPy to use source_lm for judge def _create_judge(self): - """Create an LLM judge to evaluate prompt outputs.""" - class FactJudge(dspy.Signature): - """Judge if the migrated prompt produces equivalent outputs.""" - source_output = dspy.InputField(desc="Output from source model") - target_output = dspy.InputField(desc="Output from target model") - factually_correct = dspy.OutputField( - desc="Is the target output equivalent to the source output in terms of content and intent?", - prefix="Factual[Yes/No]:" + """Create an LLM judge to evaluate outputs.""" + class OutputJudge(dspy.Signature): + """Judge the quality and equivalence of outputs.""" + input_text = dspy.InputField(desc="The coding task") + source_output = dspy.InputField(desc="Output from source prompt") + target_output = dspy.InputField(desc="Output from target prompt") + expected_output = dspy.InputField(desc="Expected output from dataset") + + equivalence = dspy.OutputField( + desc="Are the outputs functionally equivalent to the expected output? Answer ONLY with 'yes' or 'no'." + ) + accuracy = dspy.OutputField( + desc="Rate how well the outputs match the expected output. Provide ONLY a number between 0 and 100, no text." ) - reasoning = dspy.OutputField(desc="Explanation for the judgment") + consistency = dspy.OutputField( + desc="Rate how consistent the outputs are with each other. Provide ONLY a number between 0 and 100, no text." + ) + reasoning = dspy.OutputField( + desc="Explain your evaluation, focusing on functionality and correctness." + ) + + class Judge(dspy.Module): + def __init__(self): + super().__init__() + self.judge = dspy.ChainOfThought(OutputJudge) + + def forward(self, input_text, source_output, target_output, expected_output): + try: + result = self.judge( + input_text=input_text, + source_output=source_output, + target_output=target_output, + expected_output=expected_output + ) + + # Ensure numeric scores + def clean_score(score): + try: + # Extract just numbers + import re + numbers = re.findall(r'\d+', str(score)) + return float(numbers[0]) if numbers else 0.0 + except: + return 0.0 + + result.accuracy = clean_score(result.accuracy) + result.consistency = clean_score(result.consistency) + result.equivalence = str(result.equivalence).lower().strip() + + return result + except Exception as e: + print(f"Error in judge: {str(e)}") + # Return default scores + return type('Result', (), { + 'accuracy': '0', + 'consistency': '0', + 'equivalence': 'no', + 'reasoning': f'Error in evaluation: {str(e)}' + })() - return dspy.ChainOfThought(FactJudge) + return Judge() - def _get_model_output(self, model, text: str) -> str: - """Helper function to get output from different model types.""" + def _get_model_output(self, prompt: str, input_text: str) -> str: + """Get output from target model using the provided prompt.""" try: - # Try different methods since DSPy model interfaces can vary - if hasattr(model, '__call__'): - return model(text) - elif hasattr(model, 'generate'): - return model.generate(text) - elif hasattr(model, 'complete'): - return model.complete(text) - else: - raise AttributeError(f"Model {type(model)} has no supported generation method") + formatted_prompt = prompt.format(text=input_text) + response = self.target_lm(formatted_prompt) + + if isinstance(response, list): + return response[0] if response else "" + return str(response) except Exception as e: - print(f"Error generating output with {type(model)}: {str(e)}") + print(f"Error generating output: {str(e)}") return "" - def _calculate_metrics(self, evaluator, test_cases): - """Calculate evaluation metrics using LLM as judge.""" + def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: List[Dict]) -> EvaluationMetrics: + """Calculate evaluation metrics using target model for both prompts.""" total_similarity = 0.0 total_accuracy = 0.0 total_consistency = 0.0 + individual_scores = [] judge = self._create_judge() + num_cases = len(test_cases) for case in test_cases: - source_output = self._get_model_output(self.source_lm, case["text"]) - target_output = self._get_model_output(self.target_lm, case["text"]) + input_text = case["text"] + expected = case["expected_answer"] + + # Get outputs from target model using both prompts + source_output = self._get_model_output(source_prompt, input_text) + target_output = self._get_model_output(target_prompt, input_text) judgment = judge( + input_text=input_text, source_output=source_output, - target_output=target_output + target_output=target_output, + expected_output=expected ) - is_equivalent = judgment.factually_correct.lower() == "yes" + # Calculate scores + accuracy_score = float(judgment.accuracy) / 100 + consistency_score = float(judgment.consistency) / 100 + is_equivalent = judgment.equivalence.lower() == "yes" - similarity = float(is_equivalent) - accuracy = float(target_output.lower() == case["expected_summary"].lower()) - consistency = float(is_equivalent) + # Store individual scores + case_scores = { + "input": input_text, + "expected": expected, + "source_output": source_output, + "target_output": target_output, + "accuracy": accuracy_score, + "consistency": consistency_score, + "equivalent": is_equivalent, + "reasoning": judgment.reasoning + } + individual_scores.append(case_scores) - total_similarity += similarity - total_accuracy += accuracy - total_consistency += consistency + # Update totals + total_accuracy += accuracy_score + total_consistency += consistency_score + total_similarity += float(is_equivalent) - print(f"\nJudge's reasoning: {judgment.reasoning}") + print(f"\nEvaluation for case: {input_text[:50]}...") + print(f"Source output: {source_output[:100]}...") + print(f"Target output: {target_output[:100]}...") + print(f"Expected: {expected[:100]}...") + print(f"Judge's reasoning: {judgment.reasoning}") + print(f"Scores - Accuracy: {accuracy_score:.2f}, Consistency: {consistency_score:.2f}, Equivalent: {is_equivalent}") - n = len(test_cases) - return EvaluationMetrics( - accuracy=total_accuracy / n, - similarity=total_similarity / n, - consistency=total_consistency / n + # Calculate final metrics + metrics = EvaluationMetrics( + accuracy=total_accuracy / num_cases, + similarity=total_similarity / num_cases, + consistency=total_consistency / num_cases, + individual_scores=individual_scores ) + + # Save results to JSON + results = { + "source_prompt": source_prompt, + "target_prompt": target_prompt, + "aggregate_metrics": { + "accuracy": metrics.accuracy, + "similarity": metrics.similarity, + "consistency": metrics.consistency + }, + "individual_scores": individual_scores + } + + self._save_results(results) + + + return metrics def evaluate(self, source_prompt: str, target_prompt: str, test_cases: List[Dict]) -> EvaluationMetrics: - """Evaluates the quality of prompt migration using LLM as judge.""" - - metrics = self._calculate_metrics(None, test_cases) # evaluator param not needed anymore + """Evaluates both prompts using the target model.""" + return self._calculate_metrics(source_prompt, target_prompt, test_cases) + + def _save_results(self, results: dict, filename: str = 'results.json') -> None: + """Save results to a JSON file with a new name if the file already exists.""" + # Check if file exists + if os.path.exists(filename): + # Create new filename with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + base, ext = os.path.splitext(filename) + filename = f"{base}_{timestamp}{ext}" - return metrics \ No newline at end of file + # Save results + with open(filename, 'w') as f: + json.dump(results, f, indent=2) + print(f"Results saved to {filename}") \ No newline at end of file From 90d16cd7ded383263da7c67dea0a7503abe7ae41 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Thu, 5 Dec 2024 12:54:13 -0800 Subject: [PATCH 8/9] minor changes in eval, deleted formatter --- .../prompt_migration/evaluator.py | 10 +--------- .../prompt_migration/formatters.py | 17 ----------------- 2 files changed, 1 insertion(+), 26 deletions(-) delete mode 100644 recipes/use_cases/prompt-migration/prompt_migration/formatters.py diff --git a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py index 2607e68ca..446007362 100644 --- a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py +++ b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py @@ -71,7 +71,6 @@ def clean_score(score): return result except Exception as e: print(f"Error in judge: {str(e)}") - # Return default scores return type('Result', (), { 'accuracy': '0', 'consistency': '0', @@ -119,12 +118,10 @@ def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: expected_output=expected ) - # Calculate scores accuracy_score = float(judgment.accuracy) / 100 consistency_score = float(judgment.consistency) / 100 is_equivalent = judgment.equivalence.lower() == "yes" - # Store individual scores case_scores = { "input": input_text, "expected": expected, @@ -137,7 +134,6 @@ def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: } individual_scores.append(case_scores) - # Update totals total_accuracy += accuracy_score total_consistency += consistency_score total_similarity += float(is_equivalent) @@ -149,7 +145,6 @@ def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: print(f"Judge's reasoning: {judgment.reasoning}") print(f"Scores - Accuracy: {accuracy_score:.2f}, Consistency: {consistency_score:.2f}, Equivalent: {is_equivalent}") - # Calculate final metrics metrics = EvaluationMetrics( accuracy=total_accuracy / num_cases, similarity=total_similarity / num_cases, @@ -157,7 +152,6 @@ def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: individual_scores=individual_scores ) - # Save results to JSON results = { "source_prompt": source_prompt, "target_prompt": target_prompt, @@ -183,14 +177,12 @@ def evaluate(self, def _save_results(self, results: dict, filename: str = 'results.json') -> None: """Save results to a JSON file with a new name if the file already exists.""" - # Check if file exists + if os.path.exists(filename): - # Create new filename with timestamp timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') base, ext = os.path.splitext(filename) filename = f"{base}_{timestamp}{ext}" - # Save results with open(filename, 'w') as f: json.dump(results, f, indent=2) print(f"Results saved to {filename}") \ No newline at end of file diff --git a/recipes/use_cases/prompt-migration/prompt_migration/formatters.py b/recipes/use_cases/prompt-migration/prompt_migration/formatters.py deleted file mode 100644 index 42c0043bd..000000000 --- a/recipes/use_cases/prompt-migration/prompt_migration/formatters.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import List - -class PromptFormatter: - @staticmethod - def openai_to_llama(prompt: str) -> str: - """Convert OpenAI-style prompts to Llama format.""" - # Basic conversion logic - converted = prompt.replace("{{", "{").replace("}}", "}") - return converted - - @staticmethod - def extract_variables(prompt: str) -> List[str]: - """Extract variable names from a prompt template.""" - import re - pattern = r"\{([^}]+)\}" - matches = re.findall(pattern, prompt) - return list(set(matches)) \ No newline at end of file From 4d75fe97b5354a6c9205250d862102c6aa684f22 Mon Sep 17 00:00:00 2001 From: Justin Lee Date: Wed, 15 Jan 2025 06:48:10 -0800 Subject: [PATCH 9/9] update dir --- .../prompt-migration/.env.template | 0 .../prompt-migration/environment.yml | 0 .../prompt-migration/examples/usage.py | 0 .../use_cases => end-to-end-use-cases}/prompt-migration/main.py | 0 .../prompt-migration/prompt_migration/__init__.py | 0 .../prompt-migration/prompt_migration/engine.py | 0 .../prompt-migration/prompt_migration/eval_dataset.py | 0 .../prompt-migration/prompt_migration/evaluator.py | 0 .../use_cases => end-to-end-use-cases}/prompt-migration/readme.md | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/.env.template (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/environment.yml (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/examples/usage.py (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/main.py (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/prompt_migration/__init__.py (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/prompt_migration/engine.py (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/prompt_migration/eval_dataset.py (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/prompt_migration/evaluator.py (100%) rename {recipes/use_cases => end-to-end-use-cases}/prompt-migration/readme.md (100%) diff --git a/recipes/use_cases/prompt-migration/.env.template b/end-to-end-use-cases/prompt-migration/.env.template similarity index 100% rename from recipes/use_cases/prompt-migration/.env.template rename to end-to-end-use-cases/prompt-migration/.env.template diff --git a/recipes/use_cases/prompt-migration/environment.yml b/end-to-end-use-cases/prompt-migration/environment.yml similarity index 100% rename from recipes/use_cases/prompt-migration/environment.yml rename to end-to-end-use-cases/prompt-migration/environment.yml diff --git a/recipes/use_cases/prompt-migration/examples/usage.py b/end-to-end-use-cases/prompt-migration/examples/usage.py similarity index 100% rename from recipes/use_cases/prompt-migration/examples/usage.py rename to end-to-end-use-cases/prompt-migration/examples/usage.py diff --git a/recipes/use_cases/prompt-migration/main.py b/end-to-end-use-cases/prompt-migration/main.py similarity index 100% rename from recipes/use_cases/prompt-migration/main.py rename to end-to-end-use-cases/prompt-migration/main.py diff --git a/recipes/use_cases/prompt-migration/prompt_migration/__init__.py b/end-to-end-use-cases/prompt-migration/prompt_migration/__init__.py similarity index 100% rename from recipes/use_cases/prompt-migration/prompt_migration/__init__.py rename to end-to-end-use-cases/prompt-migration/prompt_migration/__init__.py diff --git a/recipes/use_cases/prompt-migration/prompt_migration/engine.py b/end-to-end-use-cases/prompt-migration/prompt_migration/engine.py similarity index 100% rename from recipes/use_cases/prompt-migration/prompt_migration/engine.py rename to end-to-end-use-cases/prompt-migration/prompt_migration/engine.py diff --git a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py b/end-to-end-use-cases/prompt-migration/prompt_migration/eval_dataset.py similarity index 100% rename from recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py rename to end-to-end-use-cases/prompt-migration/prompt_migration/eval_dataset.py diff --git a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py b/end-to-end-use-cases/prompt-migration/prompt_migration/evaluator.py similarity index 100% rename from recipes/use_cases/prompt-migration/prompt_migration/evaluator.py rename to end-to-end-use-cases/prompt-migration/prompt_migration/evaluator.py diff --git a/recipes/use_cases/prompt-migration/readme.md b/end-to-end-use-cases/prompt-migration/readme.md similarity index 100% rename from recipes/use_cases/prompt-migration/readme.md rename to end-to-end-use-cases/prompt-migration/readme.md