change eval dataset, include more robust judging, improved main
heyjustinai committed Jan 15, 2025
1 parent 43a2cbc commit b85811d
Showing 5 changed files with 291 additions and 98 deletions.
recipes/use_cases/prompt-migration/examples/usage.py (5 changes: 2 additions & 3 deletions)
@@ -16,10 +16,9 @@
model_type="openai"
)

# Example evaluation dataset
eval_dataset = [
{"text": "Example text 1", "expected_summary": "Summary 1"},
{"text": "Example text 2", "expected_summary": "Summary 2"},
{"text": "Example text 1", "expected_answer": "Summary 1"},
{"text": "Example text 2", "expected_answer": "Summary 2"},
]

# Migrate prompt
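The rename above only touches the dataset keys, so downstream code has to read "expected_answer" instead of "expected_summary". Below is a minimal sketch of a consumer of that format; it is not part of the commit, and score_output plus the hard-coded prediction are hypothetical stand-ins for a real model call.

# Hypothetical consumer of the eval dataset format shown above (not in the repo).
eval_dataset = [
    {"text": "Example text 1", "expected_answer": "Summary 1"},
    {"text": "Example text 2", "expected_answer": "Summary 2"},
]

def score_output(generated: str, expected: str) -> float:
    # Toy exact-match scorer, for illustration only.
    return float(generated.strip() == expected.strip())

for item in eval_dataset:
    prediction = item["expected_answer"]  # stand-in for a model call on item["text"]
    print(item["text"], score_output(prediction, item["expected_answer"]))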
recipes/use_cases/prompt-migration/main.py (62 changes: 52 additions & 10 deletions)
@@ -14,17 +14,48 @@ def main():
api_key=os.getenv("OPENAI_API_KEY")
)

- # target_lm = dspy.LM(
- #     model="together_ai/togethercomputer/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
- #     api_key=os.getenv("TOGETHER_API_KEY")
- # )
+ target_lm = dspy.LM(
+     model="together_ai/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+     api_key=os.getenv("TOGETHER_API_KEY")
+ )
# To run it with ollama
# target_lm = dspy.LM('ollama_chat/llama3.2:3b-instruct-fp16', api_base='http://localhost:11434', api_key='')
target_lm = dspy.HFModel(model="gpt2")

# To run it with huggingface
# target_lm = dspy.HFModel(model="gpt2")

engine = PromptMigrationEngine(openai_lm, target_lm)

source_prompt = PromptTemplate(
template="Write a Python function that takes as input a file path to an image, loads the image into memory as a numpy array, then crops the rows and columns around the perimeter if they are darker than a threshold value. Use the mean value of rows and columns to decide if they should be marked for deletion.",
template="""You are an advanced Large Language Model tasked with generating Python code snippets in response to user prompts. Your primary objective is to provide accurate, concise, and well-structured Python functions. Follow these guidelines:
Understand the Context: Analyze the input prompt and identify its category (e.g., API Usage, File Handling, Error Handling).
Generate Code:
Write Python code that directly addresses the user's request.
Ensure the code is syntactically correct, functional, and adheres to Python best practices.
Include necessary imports and handle potential edge cases.
Error Handling:
Include appropriate error handling where applicable (e.g., try-except blocks).
If exceptions occur, provide meaningful error messages.
Readability:
Use clear variable names and include comments where necessary for clarity.
Prioritize readability and maintainability in all generated code.
Complexity Alignment:
Tailor the code's complexity based on the indicated difficulty (e.g., simple, medium, complex).
Ensure that the solution is neither overly simplistic nor unnecessarily complicated.
Prompt Type:
Focus on the code_generation type for creating Python functions.
Avoid deviating from the task unless additional clarification is requested.
Testing and Validity:
Assume the function might be run immediately. Provide code that is ready for use or minimal adaptation.
Highlight any dependencies or external libraries required.
""",
input_variables=["text"],
model_type="openai"
)
@@ -33,20 +64,31 @@ def main():


# To evaluate on a specific subset, use the following:
#summarization_dataset = get_eval_subset(prompt_type="summarization")
+ code_generation_dataset = get_eval_subset(prompt_type="code_generation")
#simple_tasks = get_eval_subset(complexity="simple")
+ evaluator = PromptEvaluator(openai_lm, target_lm)
+
+ metrics = evaluator.evaluate(
+     source_prompt.template, # Same prompt for both
+     source_prompt.template, # Same prompt for both
+     code_generation_dataset
+ )
+
+ print(f"Evaluation metrics:")
+ print(f" Accuracy: {metrics.accuracy:.2f}")
+ print(f" Similarity: {metrics.similarity:.2f}")
+ print(f" Consistency: {metrics.consistency:.2f}")

# Migrate prompt
print("Migrating prompt...")
- migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset)
+ migrated_prompt = engine.migrate_prompt(source_prompt, code_generation_dataset)

# Evaluate migration
print("Evaluating migration...")
evaluator = PromptEvaluator(openai_lm, target_lm)
metrics = evaluator.evaluate(
source_prompt.template,
migrated_prompt.template,
- eval_dataset
+ code_generation_dataset
)

print(f"\nResults:")
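main.py now selects its data through get_eval_subset(prompt_type=...) and get_eval_subset(complexity=...). The helper itself is defined elsewhere in the recipe; the sketch below is only a guess at its shape, assuming each dataset entry carries "prompt_type" and "complexity" metadata, and is not part of the commit.

# Assumed shape of the subset helper called from main.py; the real
# implementation and the EVAL_DATASET contents live elsewhere in the recipe.
EVAL_DATASET = [
    {"text": "...", "expected_answer": "...",
     "prompt_type": "code_generation", "complexity": "simple"},
]

def get_eval_subset(prompt_type=None, complexity=None):
    subset = EVAL_DATASET
    if prompt_type is not None:
        subset = [ex for ex in subset if ex.get("prompt_type") == prompt_type]
    if complexity is not None:
        subset = [ex for ex in subset if ex.get("complexity") == complexity]
    return subset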
recipes/use_cases/prompt-migration/prompt_migration/engine.py (66 changes: 52 additions & 14 deletions)
@@ -9,30 +9,53 @@ class PromptTemplate:
model_type: str # 'openai' or 'llama'

class PromptMigrationEngine:
- def __init__(self, source_lm: dspy.OpenAI, target_lm: dspy.LM):
+ def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
self.source_lm = source_lm
self.target_lm = target_lm
dspy.configure(lm=source_lm)

def _optimize_transformation(self, transformer, eval_dataset):
"""Optimize the transformation using the evaluation dataset."""
class AccuracyMetric:
class PromptQualityMetric:
def __init__(self, source_lm, target_lm):
self.source_lm = source_lm
self.target_lm = target_lm

def __call__(self, example, prediction, trace=None):
- return float(prediction.target == example.expected_output)
+ if not hasattr(prediction, 'target'):
+     return 0.0

+ try:
+     # Get outputs from both models using the prompts
+     source_output = self.source_lm(example.source)
+     target_output = self.target_lm(prediction.target)
+
+     # Compare outputs (basic similarity)
+     from difflib import SequenceMatcher
+     similarity = SequenceMatcher(None,
+                                  str(source_output),
+                                  str(target_output)).ratio()
+     return similarity
+ except Exception as e:
+     print(f"Error in metric: {e}")
+     return 0.0

optimizer = dspy.BootstrapFewShotWithRandomSearch(
- metric=AccuracyMetric(),
- max_bootstrapped_demos=4,
- max_labeled_demos=4,
- num_threads=4
+ metric=PromptQualityMetric(self.source_lm, self.target_lm),
+ max_bootstrapped_demos=2,
+ max_labeled_demos=2,
+ num_threads=1
)

- train_data = [
-     dspy.Example(
-         source=item["text"],
-         expected_output=item["expected_summary"]
-     ).with_inputs("source") for item in eval_dataset
- ]
+ # Prepare training data
+ train_data = []
+ for item in eval_dataset:
+     # Create example with both prompt and expected output
+     example = dspy.Example(
+         source=item["text"],
+         expected_output=item["expected_answer"]
+     ).with_inputs("source")
+     train_data.append(example)

return optimizer.compile(transformer, trainset=train_data)

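The new PromptQualityMetric replaces exact-match accuracy with a text-similarity score between the source model's output and the target model's output. The standalone snippet below is not part of the commit; it only illustrates the difflib call the metric relies on, and the two output strings are made up.

from difflib import SequenceMatcher

# Made-up model outputs, standing in for source_lm(...) and target_lm(...).
source_output = "def crop_image(path):\n    ..."
target_output = "def crop_image(image_path):\n    ..."

# ratio() returns a value in [0, 1]; identical strings score 1.0.
similarity = SequenceMatcher(None, source_output, target_output).ratio()
print(f"similarity: {similarity:.2f}")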
@@ -44,15 +67,26 @@ def migrate_prompt(self,
class PromptTransformation(dspy.Signature):
"""Convert a prompt from one format to another."""
source = dspy.InputField(desc="Source prompt template")
target = dspy.OutputField(desc="Transformed prompt template")
target = dspy.OutputField(desc="Transformed prompt template that maintains functionality while adapting to target model format")

class Transformer(dspy.Module):
def __init__(self):
super().__init__()
self.chain = dspy.ChainOfThought(PromptTransformation)

def forward(self, source):
- return self.chain(source=source)
+ # Add context about the transformation task
+ prompt = f"""
+ Transform this prompt while:
+ 1. Maintaining core functionality
+ 2. Adapting to target model format
+ 3. Preserving input variables
+ 4. Keeping essential instructions
+ Source prompt:
+ {source}
+ """
+ return self.chain(source=prompt)

transformer = Transformer()

@@ -61,6 +95,10 @@ def forward(self, source):

result = transformer(source=source_prompt.template)

+ # Format for target model
+ if source_prompt.model_type == "openai" and "llama" in str(self.target_lm):
+     result.target = f"### Instruction:\n{result.target}\n\n### Response:"

return PromptTemplate(
template=result.target,
input_variables=source_prompt.input_variables,
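migrate_prompt() now post-processes the transformed prompt: when the source template is OpenAI-style and the target looks like a Llama model, it wraps the text in an instruction/response scaffold. The standalone sketch below is not part of the commit; the "llama" substring check mirrors the diff above, and whether this scaffold matches the chat template of a particular Llama variant is an assumption.

def format_for_target(prompt: str, target_model_name: str) -> str:
    # Mirrors the "llama" substring check in the diff above (illustrative only).
    if "llama" in target_model_name.lower():
        return f"### Instruction:\n{prompt}\n\n### Response:"
    return prompt

print(format_for_target("Summarize the following text: {text}",
                        "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo"))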