[DRAFT] prompt migration engine #808

Draft · wants to merge 21 commits into `main`

2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@ __pycache__
.ipynb_checkpoints
wandb/
artifacts/

**/.env
77 changes: 77 additions & 0 deletions end-to-end-use-cases/prompt-migration/README.md
@@ -0,0 +1,77 @@

# Prompt Migration

## Overview

The **Prompt Migration** toolkit helps you assess and adapt prompts across different language models, ensuring consistent performance and reliability. It includes benchmarking capabilities and evaluation tools to measure the effectiveness of prompt migrations.

## Project Structure

- `notebooks/`: Contains Jupyter notebooks for interactive prompt migration examples
  - `harness.ipynb`: Main notebook demonstrating the prompt migration workflow
- `benchmarks/`: Tools and scripts for performance evaluation
- `environment.yml`: Conda environment specification with all required dependencies

## Prerequisites

1. **Conda Environment**
   - [Miniconda](https://docs.conda.io/en/latest/miniconda.html) or [Anaconda](https://www.anaconda.com/) installed
   - Python 3.10
   - Create and activate the environment:
     ```bash
     conda env create -f environment.yml
     conda activate prompt-migration
     ```

2. **Setting Up vLLM for Inference**
   If you plan to use [vLLM](https://github.com/vllm-project/vllm) for model inference:
   ```bash
   pip install vllm
   ```
   To serve a large model (for example, Meta’s Llama 3.3 70B Instruct), you might run:
   ```bash
   vllm serve meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size=4
   ```
   Adjust the model name and `--tensor-parallel-size` to match your hardware and parallelization needs. See the client sketch after this list for pointing DSPy at the running server.

3. **Accessing Hugging Face Datasets**
   If you need to work with private or gated Hugging Face datasets, follow these steps:
   1. **Create a Hugging Face account (if you don’t have one):**
      Visit [Hugging Face](https://huggingface.co/) and create an account.
   2. **Authenticate via the Hugging Face CLI:**
      - Install the Hugging Face Hub CLI:
        ```bash
        pip install huggingface_hub
        ```
      - Log in to Hugging Face:
        ```bash
        huggingface-cli login
        ```
      - Enter your Hugging Face credentials (username and token). You can generate or retrieve your token in your [Hugging Face settings](https://huggingface.co/settings/tokens).
   3. **Check Dataset Permissions:**
      Some datasets require explicit permission from the dataset owner. If you continue to have access issues, visit the dataset page on Hugging Face to request or confirm your access rights.
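
Once the vLLM server from step 2 is running, DSPy can talk to it through its OpenAI-compatible client. A minimal sketch, assuming vLLM's default host and port (the exact wiring used by the project lives in `notebooks/harness.ipynb`):

```python
import dspy

# vLLM exposes an OpenAI-compatible API at http://localhost:8000/v1 by default.
# The api_key value is arbitrary unless the server was started with --api-key.
lm = dspy.LM(
    "openai/meta-llama/Llama-3.3-70B-Instruct",
    api_base="http://localhost:8000/v1",
    api_key="EMPTY",
)
dspy.configure(lm=lm)
```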

## Key Dependencies

- **DSPy**: For prompt engineering and evaluation
- **LM-eval**: Evaluation framework for language models
- **PyTorch** and **Transformers**: For model inference

## Getting Started

1. **Activate your environment:**
   ```bash
   conda activate prompt-migration
   ```
2. **Start the Jupyter notebook server:**
   ```bash
   jupyter notebook
   ```
3. **Open the main notebook:**
   Navigate to `notebooks/harness.ipynb` in your browser to get started.
4. **Explore Benchmarks:**
   Use the scripts in the `benchmarks/` directory to evaluate your prompt migrations, as in the sketch below.
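
Each benchmark module in `benchmarks/` exposes the same small interface: `signature()` builds a DSPy signature from an instruction string, `datasets()` returns train/validation/test splits, and `metric()` scores a prediction against the gold answer. A rough sketch of how these pieces might plug into DSPy's evaluator (illustrative only; the authoritative workflow is the notebook):

```python
import dspy
from benchmarks import llama_mmlu_pro  # assumes benchmarks/ is importable from this directory

program = dspy.Predict(llama_mmlu_pro.signature("Answer with the correct option letter."))
data = llama_mmlu_pro.datasets()

evaluate = dspy.Evaluate(
    devset=list(data.testset),
    metric=llama_mmlu_pro.metric,
    num_threads=8,
    display_progress=True,
)
evaluate(program)
```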

## License

This project is part of the **Llama Recipes** collection. Please refer to the main repository’s license for usage terms.
Empty file.
10 changes: 10 additions & 0 deletions end-to-end-use-cases/prompt-migration/benchmarks/datatypes.py
@@ -0,0 +1,10 @@
import typing as t

if t.TYPE_CHECKING:
    import dspy


class TaskDatasets(t.NamedTuple):
    trainset: t.Iterable["dspy.Example"]
    valset: t.Iterable["dspy.Example"]
    testset: t.Iterable["dspy.Example"]
25 changes: 25 additions & 0 deletions end-to-end-use-cases/prompt-migration/benchmarks/helpers.py
@@ -0,0 +1,25 @@
import typing as t

from .datatypes import TaskDatasets

if t.TYPE_CHECKING:
    from datasets import Dataset
    import dspy


def train_val_test_split(
    dataset: "Dataset",
    mapper: t.Callable[[dict], "dspy.Example"],
    train_size: float = 0.1,
    validation_size: float = 0.2,
) -> TaskDatasets:
    # First split: `train_size` fraction of the dataset becomes the train set.
    docs = dataset.train_test_split(train_size=train_size)
    train_docs = docs["train"]
    # Second split: `validation_size` fraction of the remainder becomes the
    # validation set; everything left over is held out as the test set.
    docs = docs["test"].train_test_split(train_size=validation_size)
    validation_docs = docs["train"]
    test_docs = docs["test"]
    return TaskDatasets(
        trainset=list(map(mapper, train_docs)),
        valset=list(map(mapper, validation_docs)),
        testset=list(map(mapper, test_docs)),
    )
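
For reference, a hypothetical usage sketch (not part of this PR) showing how the two-stage split plays out, assuming the `benchmarks` package is importable and using the MMLU-Pro test split as input:

```python
import dspy
from datasets import load_dataset

from benchmarks.helpers import train_val_test_split


def to_example(doc: dict) -> dspy.Example:
    # Minimal illustrative mapper; field names depend on the dataset you load.
    return dspy.Example(question=doc["question"], answer=doc["answer"]).with_inputs("question")


data = train_val_test_split(
    load_dataset("TIGER-Lab/MMLU-Pro")["test"],  # a datasets.Dataset, not a DatasetDict
    to_example,
    train_size=0.1,       # 10% of the data -> trainset
    validation_size=0.2,  # 20% of the remaining 90% -> valset (~18%); the rest (~72%) -> testset
)
print(len(data.trainset), len(data.valset), len(data.testset))
```
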
77 changes: 77 additions & 0 deletions end-to-end-use-cases/prompt-migration/benchmarks/humaneval.py
@@ -0,0 +1,77 @@
import typing as t

from bigcode_eval.tasks import humaneval
from bigcode_eval.tasks.custom_metrics.execute import check_correctness
from datasets import load_dataset
from lm_eval.evaluator_utils import eval_logger
import dspy

from .datatypes import TaskDatasets
from .helpers import train_val_test_split

if t.TYPE_CHECKING:
    from bigcode_eval.base import Task


def signature(instructions: str = "") -> dspy.Signature:
    class HumanEval(dspy.Signature):
        __doc__ = instructions
        prompt: str = dspy.InputField()
        solution: str = dspy.OutputField()

    return HumanEval


def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
    # Assemble prompt + predicted solution + unit tests into one program and
    # execute it with bigcode-eval's check_correctness.
    program = gold.prompt + "\n" + pred.solution + "\n" + gold.dspy_test
    result = check_correctness(
        program,
        timeout=30,
        task_id=gold.dspy_task_id,
        completion_id=None,
    )

    if result["passed"]:
        return True

    eval_logger.debug(f"{gold.dspy_task_id}: {result['result']}")
    return False


def datasets(
    train_size: float = 0.1,
    validation_size: float = 0.2,
) -> TaskDatasets:
    # instructhumaneval provides a single "test" split; pass the mapper so the
    # helper returns ready-made dspy.Example objects.
    dataset = load_dataset("codeparrot/instructhumaneval")
    return train_val_test_split(
        dataset["test"],
        _task_doc_example,
        train_size=train_size,
        validation_size=validation_size,
    )


class TaskDoc(t.TypedDict):
    task_id: str
    prompt: str
    canonical_solution: str
    test: str


inputs = ["prompt"]
outputs = ["solution"]


def _task_doc_example(doc: TaskDoc) -> dspy.Example:
    return dspy.Example(
        prompt=doc["prompt"],
        solution=doc["canonical_solution"],
        # dspy_-prefixed keys are hidden from the prompt
        dspy_task_id=doc["task_id"],
        dspy_test=doc["test"],
    ).with_inputs(*inputs)
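
One way this module might be used for an actual migration run is to hand its metric and data splits to a DSPy prompt optimizer. A sketch under assumptions (MIPROv2 is one of several optimizers DSPy offers, and exact optimizer arguments vary by DSPy version; the project's real workflow is in `notebooks/harness.ipynb`):

```python
import dspy
from benchmarks import humaneval  # assumes benchmarks/ is importable

# Point DSPy at the vLLM server described in the README.
dspy.configure(lm=dspy.LM(
    "openai/meta-llama/Llama-3.3-70B-Instruct",
    api_base="http://localhost:8000/v1",
    api_key="EMPTY",
))

data = humaneval.datasets()
program = dspy.Predict(humaneval.signature("Complete the function so that the tests pass."))

# Optimize ("migrate") the instructions against the pass-rate metric.
optimizer = dspy.MIPROv2(metric=humaneval.metric, auto="light")
migrated = optimizer.compile(program, trainset=list(data.trainset), valset=list(data.valset))

# Score the optimized program on the held-out test split.
evaluate = dspy.Evaluate(devset=list(data.testset), metric=humaneval.metric, num_threads=4)
print(evaluate(migrated))
```
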
@@ -0,0 +1,61 @@
import typing as t

from datasets import load_dataset
import dspy

from .datatypes import TaskDatasets
from .helpers import train_val_test_split


def signature(instructions: str = "") -> dspy.Signature:
    class MMLUPro(dspy.Signature):
        __doc__ = instructions
        question: str = dspy.InputField()
        options: list[str] = dspy.InputField()
        answer: str = dspy.OutputField()

    return MMLUPro


def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
    return gold.answer == pred.answer


def datasets(
    train_size: float = 0.1,
    validation_size: float = 0.2,
) -> TaskDatasets:
    dataset = load_dataset("TIGER-Lab/MMLU-Pro")
    return train_val_test_split(
        dataset["test"], _task_doc_example, train_size, validation_size
    )


class TaskDoc(t.TypedDict):
    question_id: int
    question: str
    options: list[str]
    answer: str
    answer_index: int
    cot_content: str
    category: str
    src: str


inputs = ["question", "options"]
outputs = ["answer"]


def _num_letter(n: int) -> str:
    return chr(ord("A") + n)


def _task_doc_example(doc: TaskDoc) -> dspy.Example:
    question = doc["question"]
    options = [f"{_num_letter(i)}. {option}" for i, option in enumerate(doc["options"])]
    answer = doc["answer"]
    return dspy.Example(
        question=question,
        options=options,
        answer=answer,
    ).with_inputs(*inputs)
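
In both MMLU-Pro variants, the `instructions` string passed to `signature()` becomes the signature's docstring, which DSPy treats as the task instructions, so a "migrated" prompt is simply a different string handed to the same factory. A hypothetical sketch, assuming this module is importable as `benchmarks.mmlu_pro` (an assumed path):

```python
import dspy
from benchmarks import mmlu_pro  # assumed module path for the file above

baseline = dspy.Predict(mmlu_pro.signature(
    "Answer the multiple-choice question with a single letter."
))
migrated = dspy.Predict(mmlu_pro.signature(
    "Read the question and the lettered options carefully, then respond with "
    "only the letter of the best option."
))
# Both programs share the metric() and datasets() defined above, so they can be
# scored side by side with dspy.Evaluate.
```
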
76 changes: 76 additions & 0 deletions end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py
@@ -0,0 +1,76 @@
import typing as t

import dspy

from datasets import load_dataset

from .datatypes import TaskDatasets
from .helpers import train_val_test_split


def datasets(
    train_size: float = 0.1,
    validation_size: float = 0.2,
) -> TaskDatasets:
    """
    Load the dataset and pass a single `datasets.Dataset` split to the helper
    (not a `DatasetDict`); split the dataset yourself first if you need a
    different layout.
    """
    dataset = load_dataset(
        "meta-llama/Llama-3.3-70B-Instruct-evals",
        "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
    )
    return train_val_test_split(
        dataset["latest"],
        _task_doc_example,
        train_size,
        validation_size,
    )


class TaskDoc(t.TypedDict):
    task_type: str
    task_name: str
    subtask_name: str
    input_question: str
    input_choice_list: dict
    input_final_prompts: list
    input_correct_responses: list
    output_prediction_text: list
    output_parsed_answer: str
    output_choice_completions: t.Optional[dict]
    output_choice_negative_log_likelihoods: t.Optional[dict]
    output_metrics: dict
    is_correct: bool
    input_question_hash: str
    input_final_prompts_hash: list
    benchmark_label: str
    eval_config: dict


inputs = ["input_question", "input_choice_list"]
outputs = ["output_parsed_answer"]


def _task_doc_example(doc: TaskDoc) -> dspy.Example:
    example = dspy.Example(
        question=doc["input_question"],
        options=doc["input_choice_list"],
        answer=doc["output_parsed_answer"],
    )
    example._input_keys = {"question", "options"}
    example._output_keys = {"answer"}
    return example


def signature(instructions: str = "") -> dspy.Signature:
    class MMLUPro(dspy.Signature):
        __doc__ = instructions
        question: str = dspy.InputField(desc="The question to be answered")
        options: dict = dspy.InputField(desc="Dictionary of answer choices")
        answer: str = dspy.OutputField(desc="The correct answer letter")

    return MMLUPro


def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
    return gold.answer == pred.answer
25 changes: 25 additions & 0 deletions end-to-end-use-cases/prompt-migration/environment.yml
@@ -0,0 +1,25 @@
name: prompt-migration
channels:
  - defaults
  - pytorch
  - conda-forge
dependencies:
  - python=3.10  # Updated to match pyproject.toml requires-python
  - pip
  - numpy<2  # Matches pyproject.toml dependency
  - pip:
      - dspy @ git+ssh://[email protected]/stanfordnlp/dspy.git
      - lm-eval[wandb,api,math,ifeval,sentencepiece]>=0.4.7
      - python-dotenv>=1.0.1
      - ipdb>=0.13.13
      - ipython>=8.31.0
      - pytest>=8.3.4
      - ruff>=0.9.1
      - ipykernel>=6.29.5
      - torch
      - transformers
      - openai
      - databricks-sdk
      - bigcode_eval @ git+ssh://[email protected]/bigcode-project/bigcode-evaluation-harness.git