From b85811d0b9891413485b179731ba67cf42028bb5 Mon Sep 17 00:00:00 2001
From: Justin Lee <justinlee38@outlook.com>
Date: Thu, 5 Dec 2024 11:22:10 -0800
Subject: [PATCH] change eval dataset, include more robust judging, improved
 main

---
 .../prompt-migration/examples/usage.py        |   5 +-
 recipes/use_cases/prompt-migration/main.py    |  62 +++++-
 .../prompt_migration/engine.py                |  66 ++++--
 .../prompt_migration/eval_dataset.py          |  58 ++---
 .../prompt_migration/evaluator.py             | 198 ++++++++++++++----
 5 files changed, 291 insertions(+), 98 deletions(-)

diff --git a/recipes/use_cases/prompt-migration/examples/usage.py b/recipes/use_cases/prompt-migration/examples/usage.py
index 883fa9737..2c88008a5 100644
--- a/recipes/use_cases/prompt-migration/examples/usage.py
+++ b/recipes/use_cases/prompt-migration/examples/usage.py
@@ -16,10 +16,9 @@
     model_type="openai"
 )
 
-# Example evaluation dataset
 eval_dataset = [
-    {"text": "Example text 1", "expected_summary": "Summary 1"},
-    {"text": "Example text 2", "expected_summary": "Summary 2"},
+    {"text": "Example text 1", "expected_answer": "Summary 1"},
+    {"text": "Example text 2", "expected_answer": "Summary 2"},
 ]
 
 # Migrate prompt
diff --git a/recipes/use_cases/prompt-migration/main.py b/recipes/use_cases/prompt-migration/main.py
index 40991468b..83eb1fc46 100644
--- a/recipes/use_cases/prompt-migration/main.py
+++ b/recipes/use_cases/prompt-migration/main.py
@@ -14,17 +14,48 @@ def main():
         api_key=os.getenv("OPENAI_API_KEY")
     )
     
-    # target_lm = dspy.LM(
-    #     model="together_ai/togethercomputer/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
-    #     api_key=os.getenv("TOGETHER_API_KEY")
-    # )
+    target_lm = dspy.LM(
+        model="together_ai/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+        api_key=os.getenv("TOGETHER_API_KEY")
+    )
+    # To run it with ollama
     # target_lm = dspy.LM('ollama_chat/llama3.2:3b-instruct-fp16', api_base='http://localhost:11434', api_key='')
-    target_lm = dspy.HFModel(model="gpt2")
+
+    # To run it with huggingface
+    # target_lm = dspy.HFModel(model="gpt2")
     
     engine = PromptMigrationEngine(openai_lm, target_lm)
     
     source_prompt = PromptTemplate(
-        template="Write a Python function that takes as input a file path to an image, loads the image into memory as a numpy array, then crops the rows and columns around the perimeter if they are darker than a threshold value. Use the mean value of rows and columns to decide if they should be marked for deletion.",
+        template="""You are an advanced Large Language Model tasked with generating Python code snippets in response to user prompts. Your primary objective is to provide accurate, concise, and well-structured Python functions. Follow these guidelines:
+
+    Understand the Context: Analyze the input prompt and identify its category (e.g., API Usage, File Handling, Error Handling).
+
+    Generate Code:
+        Write Python code that directly addresses the user's request.
+        Ensure the code is syntactically correct, functional, and adheres to Python best practices.
+        Include necessary imports and handle potential edge cases.
+
+    Error Handling:
+        Include appropriate error handling where applicable (e.g., try-except blocks).
+        If exceptions occur, provide meaningful error messages.
+
+    Readability:
+        Use clear variable names and include comments where necessary for clarity.
+        Prioritize readability and maintainability in all generated code.
+
+    Complexity Alignment:
+        Tailor the code's complexity based on the indicated difficulty (e.g., simple, medium, complex).
+        Ensure that the solution is neither overly simplistic nor unnecessarily complicated.
+
+    Prompt Type:
+        Focus on the code_generation type for creating Python functions.
+        Avoid deviating from the task unless additional clarification is requested.
+
+    Testing and Validity:
+        Assume the function might be run immediately. Provide code that is ready for use or minimal adaptation.
+        Highlight any dependencies or external libraries required.
+        """,
         input_variables=["text"],
         model_type="openai"
     )
@@ -33,20 +64,31 @@ def main():
 
 
     # To evaluate on a specific subset, use the following:
-    #summarization_dataset = get_eval_subset(prompt_type="summarization")
+    code_generation_dataset = get_eval_subset(prompt_type="code_generation")
     #simple_tasks = get_eval_subset(complexity="simple")
+    evaluator = PromptEvaluator(openai_lm, target_lm)
+
+    metrics = evaluator.evaluate(
+        source_prompt.template,  # Same prompt for both
+        source_prompt.template,  # Same prompt for both
+        code_generation_dataset
+    )
+    
+    print(f"Evaluation metrics:")
+    print(f"  Accuracy: {metrics.accuracy:.2f}")
+    print(f"  Similarity: {metrics.similarity:.2f}")
+    print(f"  Consistency: {metrics.consistency:.2f}")
     
     # Migrate prompt
     print("Migrating prompt...")
-    migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset)
+    migrated_prompt = engine.migrate_prompt(source_prompt, code_generation_dataset)
     
     # Evaluate migration
     print("Evaluating migration...")
-    evaluator = PromptEvaluator(openai_lm, target_lm)
     metrics = evaluator.evaluate(
         source_prompt.template,
         migrated_prompt.template,
-        eval_dataset
+        code_generation_dataset
     )
     
     print(f"\nResults:")
diff --git a/recipes/use_cases/prompt-migration/prompt_migration/engine.py b/recipes/use_cases/prompt-migration/prompt_migration/engine.py
index a9093cfae..319abc781 100644
--- a/recipes/use_cases/prompt-migration/prompt_migration/engine.py
+++ b/recipes/use_cases/prompt-migration/prompt_migration/engine.py
@@ -9,30 +9,53 @@ class PromptTemplate:
     model_type: str  # 'openai' or 'llama'
 
 class PromptMigrationEngine:
-    def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM):
+    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
         self.source_lm = source_lm
         self.target_lm = target_lm
         dspy.configure(lm=source_lm)
     
     def _optimize_transformation(self, transformer, eval_dataset):
         """Optimize the transformation using the evaluation dataset."""
-        class AccuracyMetric:
+        class PromptQualityMetric:
+            def __init__(self, source_lm, target_lm):
+                self.source_lm = source_lm
+                self.target_lm = target_lm
+            
             def __call__(self, example, prediction, trace=None):
-                return float(prediction.target == example.expected_output)
+                if not hasattr(prediction, 'target'):
+                    return 0.0
+                
+                try:
+                    # Call the source LM with the example's source text and the target LM with the transformed prompt
+                    source_output = self.source_lm(example.source)
+                    target_output = self.target_lm(prediction.target)
+                    
+                    # Score is the difflib SequenceMatcher ratio of the two stringified outputs
+                    from difflib import SequenceMatcher
+                    similarity = SequenceMatcher(None, 
+                                              str(source_output), 
+                                              str(target_output)).ratio()
+                    return similarity
+                except Exception as e:
+                    print(f"Error in metric: {e}")
+                    return 0.0
         
         optimizer = dspy.BootstrapFewShotWithRandomSearch(
-            metric=AccuracyMetric(),
-            max_bootstrapped_demos=4,
-            max_labeled_demos=4,
-            num_threads=4
+            metric=PromptQualityMetric(self.source_lm, self.target_lm),
+            max_bootstrapped_demos=2,
+            max_labeled_demos=2,
+            num_threads=1
         )
         
-        train_data = [
-            dspy.Example(
+        # Build one dspy.Example per dataset item; only "source" is marked as an input field
+        train_data = []
+        for item in eval_dataset:
+            # item["text"] becomes `source`; item["expected_answer"] becomes `expected_output`
+            example = dspy.Example(
                 source=item["text"],
-                expected_output=item["expected_summary"]
-            ).with_inputs("source") for item in eval_dataset
-        ]
+                expected_output=item["expected_answer"]
+            ).with_inputs("source")
+            train_data.append(example)
         
         return optimizer.compile(transformer, trainset=train_data)
     
@@ -44,7 +67,7 @@ def migrate_prompt(self,
         class PromptTransformation(dspy.Signature):
             """Convert a prompt from one format to another."""
             source = dspy.InputField(desc="Source prompt template")
-            target = dspy.OutputField(desc="Transformed prompt template")
+            target = dspy.OutputField(desc="Transformed prompt template that maintains functionality while adapting to target model format")
         
         class Transformer(dspy.Module):
             def __init__(self):
@@ -52,7 +75,18 @@ def __init__(self):
                 self.chain = dspy.ChainOfThought(PromptTransformation)
             
             def forward(self, source):
-                return self.chain(source=source)
+                # Add context about the transformation task
+                prompt = f"""
+                Transform this prompt while:
+                1. Maintaining core functionality
+                2. Adapting to target model format
+                3. Preserving input variables
+                4. Keeping essential instructions
+                
+                Source prompt:
+                {source}
+                """
+                return self.chain(source=prompt)
         
         transformer = Transformer()
         
@@ -61,6 +95,10 @@ def forward(self, source):
             
         result = transformer(source=source_prompt.template)
         
+        # Format for target model
+        if source_prompt.model_type == "openai" and "llama" in str(self.target_lm):
+            result.target = f"### Instruction:\n{result.target}\n\n### Response:"
+        
         return PromptTemplate(
             template=result.target,
             input_variables=source_prompt.input_variables,
diff --git a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py
index 8a6c0bfae..c7fae9f14 100644
--- a/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py
+++ b/recipes/use_cases/prompt-migration/prompt_migration/eval_dataset.py
@@ -5,7 +5,7 @@ def get_evaluation_dataset() -> List[Dict]:
     Returns a comprehensive evaluation dataset for testing prompt migrations.
     Each test case includes:
     - text: Input text
-    - expected_summary: Expected output
+    - expected_answer: Expected output
     - prompt_type: Type of prompt (summarization, classification, qa, etc.)
     - complexity: Difficulty level (simple, medium, complex)
     """
@@ -13,7 +13,7 @@ def get_evaluation_dataset() -> List[Dict]:
         # Summarization examples
         {
             "text": "The quick brown fox jumps over the lazy dog.",
-            "expected_summary": "A fox jumps over a dog.",
+            "expected_answer": "A fox jumps over a dog.",
             "prompt_type": "summarization",
             "complexity": "simple"
         },
@@ -21,7 +21,7 @@ def get_evaluation_dataset() -> List[Dict]:
             "text": """Machine learning is a subset of artificial intelligence that focuses on developing 
                    systems that can learn from and make decisions based on data. It has numerous 
                    applications in various fields including healthcare, finance, and autonomous vehicles.""",
-            "expected_summary": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.",
+            "expected_answer": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.",
             "prompt_type": "summarization",
             "complexity": "medium"
         },
@@ -29,13 +29,13 @@ def get_evaluation_dataset() -> List[Dict]:
         # Classification examples
         {
             "text": "I absolutely loved this product! Best purchase ever!",
-            "expected_summary": "Positive",
+            "expected_answer": "Positive",
             "prompt_type": "sentiment_classification",
             "complexity": "simple"
         },
         {
             "text": "The product works fine but the customer service could be better.",
-            "expected_summary": "Neutral",
+            "expected_answer": "Neutral",
             "prompt_type": "sentiment_classification",
             "complexity": "medium"
         },
@@ -43,7 +43,7 @@ def get_evaluation_dataset() -> List[Dict]:
         # Question-Answering examples
         {
             "text": "What is the capital of France? Context: Paris is the capital and largest city of France.",
-            "expected_summary": "Paris",
+            "expected_answer": "Paris",
             "prompt_type": "qa",
             "complexity": "simple"
         },
@@ -51,7 +51,7 @@ def get_evaluation_dataset() -> List[Dict]:
             "text": """What causes rain? Context: Rain is precipitation of liquid water in the form of droplets. 
                    Water vapor in warm air rises and cools, forming clouds. When the droplets become too 
                    heavy, they fall as rain.""",
-            "expected_summary": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.",
+            "expected_answer": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.",
             "prompt_type": "qa",
             "complexity": "medium"
         },
@@ -59,13 +59,13 @@ def get_evaluation_dataset() -> List[Dict]:
         # Code-related examples
         {
             "text": "Write a function to add two numbers in Python.",
-            "expected_summary": "def add(a, b):\n    return a + b",
+            "expected_answer": "def add(a, b):\n    return a + b",
             "prompt_type": "code_generation",
             "complexity": "simple"
         },
         {
             "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2",
-            "expected_summary": "This code multiplies each element in the array 'arr' by 2.",
+            "expected_answer": "This code multiplies each element in the array 'arr' by 2.",
             "prompt_type": "code_explanation",
             "complexity": "simple"
         },
@@ -73,13 +73,13 @@ def get_evaluation_dataset() -> List[Dict]:
         # Text transformation examples
         {
             "text": "convert this to passive voice: The cat chased the mouse.",
-            "expected_summary": "The mouse was chased by the cat.",
+            "expected_answer": "The mouse was chased by the cat.",
             "prompt_type": "text_transformation",
             "complexity": "simple"
         },
         {
             "text": "translate to French: Hello, how are you?",
-            "expected_summary": "Bonjour, comment allez-vous?",
+            "expected_answer": "Bonjour, comment allez-vous?",
             "prompt_type": "translation",
             "complexity": "simple"
         },
@@ -89,24 +89,24 @@ def get_evaluation_dataset() -> List[Dict]:
             "text": """A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves 
                    Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations 
                    are 375 miles apart, at what time will the trains meet?""",
-            "expected_summary": "The trains will meet at 5:00 PM.",
+            "expected_answer": "The trains will meet at 5:20 PM.",
             "prompt_type": "problem_solving",
             "complexity": "complex"
         },
         {
             "text": """Analyze the environmental impact of electric vehicles versus traditional 
                    gasoline vehicles, considering manufacturing, operation, and disposal.""",
-            "expected_summary": """Electric vehicles typically have higher manufacturing emissions but lower 
+            "expected_answer": """Electric vehicles typically have higher manufacturing emissions but lower 
                               operational emissions compared to gasoline vehicles. Overall lifecycle 
                               environmental impact depends on electricity source and battery recycling.""",
             "prompt_type": "analysis",
             "complexity": "complex"
         },
 
-        # Code Generation
+        # Simple Code Generation
         {
             "text": "Write a Python function to check if a number is prime.",
-            "expected_summary": """def is_prime(n):
+            "expected_answer": """def is_prime(n):
     if n < 2:
         return False
     for i in range(2, int(n ** 0.5) + 1):
@@ -118,22 +118,24 @@ def get_evaluation_dataset() -> List[Dict]:
         },
         {
             "text": "Create a Python function to reverse a string.",
-            "expected_summary": """def reverse_string(s):
+            "expected_answer": """def reverse_string(s):
     return s[::-1]""",
             "prompt_type": "code_generation",
             "complexity": "simple"
         },
         
+        # Code Explanation
         {
             "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]",
-            "expected_summary": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.",
+            "expected_answer": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.",
             "prompt_type": "code_explanation",
             "complexity": "medium"
         },
         
+        # Algorithm Implementation
         {
             "text": "Write a Python function to implement binary search.",
-            "expected_summary": """def binary_search(arr, target):
+            "expected_answer": """def binary_search(arr, target):
     left, right = 0, len(arr) - 1
     
     while left <= right:
@@ -150,9 +152,10 @@ def get_evaluation_dataset() -> List[Dict]:
             "complexity": "medium"
         },
         
+        # Data Structure Implementation
         {
             "text": "Implement a Stack class in Python using a list.",
-            "expected_summary": """class Stack:
+            "expected_answer": """class Stack:
     def __init__(self):
         self.items = []
         
@@ -173,9 +176,10 @@ def peek(self):
             "complexity": "medium"
         },
         
+        # Code Debugging
         {
             "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)",
-            "expected_summary": """def factorial(n):
+            "expected_answer": """def factorial(n):
     if n == 0 or n == 1:
         return 1
     return n * factorial(n-1)""",
@@ -183,9 +187,10 @@ def peek(self):
             "complexity": "medium"
         },
         
+        # Code Optimization
         {
             "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n",
-            "expected_summary": """def fibonacci(n):
+            "expected_answer": """def fibonacci(n):
     if n <= 1:
         return n
     a, b = 0, 1
@@ -196,9 +201,10 @@ def peek(self):
             "complexity": "medium"
         },
         
+        # API Usage
         {
             "text": "Write a Python function using requests to fetch data from a REST API endpoint.",
-            "expected_summary": """import requests
+            "expected_answer": """import requests
 
 def fetch_data(url, params=None):
     try:
@@ -212,9 +218,10 @@ def fetch_data(url, params=None):
             "complexity": "medium"
         },
         
+        # File Handling
         {
             "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.",
-            "expected_summary": """import csv
+            "expected_answer": """import csv
 
 def read_csv(file_path):
     data = []
@@ -231,9 +238,10 @@ def read_csv(file_path):
             "complexity": "medium"
         },
         
+        # Error Handling
         {
             "text": "Write a Python function that safely converts a string to integer with error handling.",
-            "expected_summary": """def safe_int_convert(s):
+            "expected_answer": """def safe_int_convert(s):
     try:
         return int(s), None
     except ValueError as e:
@@ -245,7 +253,7 @@ def read_csv(file_path):
         # Complex Algorithm
         {
             "text": "Implement a Python function for Depth-First Search on a graph.",
-            "expected_summary": """def dfs(graph, start, visited=None):
+            "expected_answer": """def dfs(graph, start, visited=None):
     if visited is None:
         visited = set()
     
diff --git a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py
index 33bb09f07..2607e68ca 100644
--- a/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py
+++ b/recipes/use_cases/prompt-migration/prompt_migration/evaluator.py
@@ -1,90 +1,196 @@
-import dspy
+import json
 from typing import List, Dict
 from dataclasses import dataclass
+import dspy
+import os
+from datetime import datetime
 
 @dataclass
 class EvaluationMetrics:
     accuracy: float
     similarity: float
     consistency: float
+    individual_scores: List[Dict]  # One dict per test case: input, expected, both outputs, scores, judge reasoning
 
 class PromptEvaluator:
-    def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM):
+    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
         self.source_lm = source_lm
         self.target_lm = target_lm
+        dspy.configure(lm=source_lm)  # Sets DSPy's process-wide default LM to source_lm (the judge module relies on the default)
         
     def _create_judge(self):
-        """Create an LLM judge to evaluate prompt outputs."""
-        class FactJudge(dspy.Signature):
-            """Judge if the migrated prompt produces equivalent outputs."""
-            source_output = dspy.InputField(desc="Output from source model")
-            target_output = dspy.InputField(desc="Output from target model")
-            factually_correct = dspy.OutputField(
-                desc="Is the target output equivalent to the source output in terms of content and intent?",
-                prefix="Factual[Yes/No]:"
+        """Build the judge module; its result carries float accuracy/consistency (0-100), 'yes'/'no' equivalence, and reasoning."""
+        class OutputJudge(dspy.Signature):
+            """Judge the quality and equivalence of outputs."""
+            input_text = dspy.InputField(desc="The coding task")
+            source_output = dspy.InputField(desc="Output from source prompt")
+            target_output = dspy.InputField(desc="Output from target prompt")
+            expected_output = dspy.InputField(desc="Expected output from dataset")
+            
+            equivalence = dspy.OutputField(
+                desc="Are the outputs functionally equivalent to the expected output? Answer ONLY with 'yes' or 'no'."
+            )
+            accuracy = dspy.OutputField(
+                desc="Rate how well the outputs match the expected output. Provide ONLY a number between 0 and 100, no text."
             )
-            reasoning = dspy.OutputField(desc="Explanation for the judgment")
+            consistency = dspy.OutputField(
+                desc="Rate how consistent the outputs are with each other. Provide ONLY a number between 0 and 100, no text."
+            )
+            reasoning = dspy.OutputField(
+                desc="Explain your evaluation, focusing on functionality and correctness."
+            )
+
+        class Judge(dspy.Module):
+            def __init__(self):
+                super().__init__()
+                self.judge = dspy.ChainOfThought(OutputJudge)
+            
+            def forward(self, input_text, source_output, target_output, expected_output):
+                try:
+                    result = self.judge(
+                        input_text=input_text,
+                        source_output=source_output,
+                        target_output=target_output,
+                        expected_output=expected_output
+                    )
+                    
+                    # The LLM replies with free text, so normalise every field before callers do arithmetic on it
+                    def clean_score(score):
+                        """Return the first number in the judge's reply (decimals kept), capped at 100; 0.0 if none."""
+                        import re
+                        found = re.search(r'\d+(?:\.\d+)?', str(score))
+                        if found is None:
+                            return 0.0  # reply contained no digits at all
+                        # Cap at 100: callers divide by 100 and expect a value in 0-1
+                        return min(float(found.group()), 100.0)
+                    
+                    result.accuracy = clean_score(result.accuracy)
+                    result.consistency = clean_score(result.consistency)
+                    result.equivalence = "yes" if str(result.equivalence).strip().lower().startswith("yes") else "no"
+                    
+                    return result
+                except Exception as e:
+                    print(f"Error in judge: {str(e)}")
+                    # Fallback object with the same field types as the success path (floats, 'no')
+                    return type('Result', (), {
+                        'accuracy': 0.0,
+                        'consistency': 0.0,
+                        'equivalence': 'no',
+                        'reasoning': f'Error in evaluation: {str(e)}'
+                    })()
 
-        return dspy.ChainOfThought(FactJudge)
+        return Judge()
 
-    def _get_model_output(self, model, text: str) -> str:
-        """Helper function to get output from different model types."""
+    def _get_model_output(self, prompt: str, input_text: str) -> str:
+        """Run the target LM on `prompt` combined with `input_text`; return its first completion, or an empty string on error."""
         try:
-            # Try different methods since DSPy model interfaces can vary
-            if hasattr(model, '__call__'):
-                return model(text)
-            elif hasattr(model, 'generate'):
-                return model.generate(text)
-            elif hasattr(model, 'complete'):
-                return model.complete(text)
-            else:
-                raise AttributeError(f"Model {type(model)} has no supported generation method")
+            # Literal substitution: str.format() raises on any other brace and silently drops the input when {text} is absent
+            filled = prompt.replace("{text}", input_text) if "{text}" in prompt else f"{prompt}\n\n{input_text}"
+            response = self.target_lm(filled)
+            if isinstance(response, list):  # e.g. dspy.LM returns a list of completions; keep the first
+                return response[0] if response else ""
+            return str(response)
         except Exception as e:
-            print(f"Error generating output with {type(model)}: {str(e)}")
+            print(f"Error generating output: {str(e)}")
             return ""
 
-    def _calculate_metrics(self, evaluator, test_cases):
-        """Calculate evaluation metrics using LLM as judge."""
+    def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: List[Dict]) -> EvaluationMetrics:
+        """Run both prompts on the target model per test case, have the judge score them, save a JSON report, return averaged metrics."""
         total_similarity = 0.0
         total_accuracy = 0.0
         total_consistency = 0.0
+        individual_scores = []
         
         judge = self._create_judge()
+        num_cases = len(test_cases) or 1  # 'or 1': an empty dataset yields 0.0 metrics instead of ZeroDivisionError
         
         for case in test_cases:
-            source_output = self._get_model_output(self.source_lm, case["text"])
-            target_output = self._get_model_output(self.target_lm, case["text"])
+            input_text = case["text"]
+            expected = case["expected_answer"]
+            
+            # Both prompts are executed on the target model so only the prompt differs
+            source_output = self._get_model_output(source_prompt, input_text)
+            target_output = self._get_model_output(target_prompt, input_text)
             
             judgment = judge(
+                input_text=input_text,
                 source_output=source_output,
-                target_output=target_output
+                target_output=target_output,
+                expected_output=expected
             )
             
-            is_equivalent = judgment.factually_correct.lower() == "yes"
+            # Judge scores are on a 0-100 scale: bound them, then rescale to 0-1; accept "yes", "Yes." etc.
+            accuracy_score = min(max(float(judgment.accuracy), 0.0), 100.0) / 100
+            consistency_score = min(max(float(judgment.consistency), 0.0), 100.0) / 100
+            is_equivalent = str(judgment.equivalence).strip().lower().startswith("yes")
             
-            similarity = float(is_equivalent)
-            accuracy = float(target_output.lower() == case["expected_summary"].lower())
-            consistency = float(is_equivalent)
+            # Per-case record, returned in the metrics and written to the JSON report
+            case_scores = {
+                "input": input_text,
+                "expected": expected,
+                "source_output": source_output,
+                "target_output": target_output,
+                "accuracy": accuracy_score,
+                "consistency": consistency_score,
+                "equivalent": is_equivalent,
+                "reasoning": judgment.reasoning
+            }
+            individual_scores.append(case_scores)
             
-            total_similarity += similarity
-            total_accuracy += accuracy
-            total_consistency += consistency
+            # Running sums; similarity counts the cases judged equivalent
+            total_accuracy += accuracy_score
+            total_consistency += consistency_score
+            total_similarity += float(is_equivalent)
             
-            print(f"\nJudge's reasoning: {judgment.reasoning}")
+            print(f"\nEvaluation for case: {input_text[:50]}...")
+            print(f"Source output: {source_output[:100]}...")
+            print(f"Target output: {target_output[:100]}...")
+            print(f"Expected: {expected[:100]}...")
+            print(f"Judge's reasoning: {judgment.reasoning}")
+            print(f"Scores - Accuracy: {accuracy_score:.2f}, Consistency: {consistency_score:.2f}, Equivalent: {is_equivalent}")
         
-        n = len(test_cases)
-        return EvaluationMetrics(
-            accuracy=total_accuracy / n,
-            similarity=total_similarity / n,
-            consistency=total_consistency / n
+        # Average the running sums over the number of cases
+        metrics = EvaluationMetrics(
+            accuracy=total_accuracy / num_cases,
+            similarity=total_similarity / num_cases,
+            consistency=total_consistency / num_cases,
+            individual_scores=individual_scores
         )
+        
+        # Report payload: both prompts, the aggregates, and every per-case record
+        results = {
+            "source_prompt": source_prompt,
+            "target_prompt": target_prompt,
+            "aggregate_metrics": {
+                "accuracy": metrics.accuracy,
+                "similarity": metrics.similarity,
+                "consistency": metrics.consistency
+            },
+            "individual_scores": individual_scores
+        }
+        
+        self._save_results(results)
+
+        
+        return metrics
     
     def evaluate(self, 
                 source_prompt: str, 
                 target_prompt: str, 
                 test_cases: List[Dict]) -> EvaluationMetrics:
-        """Evaluates the quality of prompt migration using LLM as judge."""
-        
-        metrics = self._calculate_metrics(None, test_cases)  # evaluator param not needed anymore
+        """Evaluate source_prompt and target_prompt over test_cases (delegates to _calculate_metrics) and return the aggregate metrics."""
+        return self._calculate_metrics(source_prompt, target_prompt, test_cases)
+    
+    def _save_results(self, results: dict, filename: str = 'results.json') -> None:
+        """Write results as indented JSON to filename, switching to a timestamped name when that file already exists."""
+        # Keep an earlier run's file instead of overwriting it
+        if os.path.exists(filename):
+            # Insert a local-time stamp before the extension, e.g. results_20241205_112210.json
+            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+            base, ext = os.path.splitext(filename)
+            filename = f"{base}_{timestamp}{ext}"
         
-        return metrics
\ No newline at end of file
+        # NOTE: mode 'w' truncates, so a timestamped name that already exists (same second) is overwritten
+        with open(filename, 'w') as f:
+            json.dump(results, f, indent=2)
+        print(f"Results saved to {filename}")
\ No newline at end of file