
feature/llm evals #1

Open · wants to merge 11 commits into master
33 changes: 33 additions & 0 deletions llm_evals/Dockerfile
@@ -0,0 +1,33 @@
# Dockerfile for a custom image that includes numpy, sympy, and sage
FROM python:3.12-bookworm

# Install any packages you need
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install Miniforge (recommended for SageMath installation)
RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-$(uname -m).sh" \
    && bash Miniforge3-Linux-$(uname -m).sh -b -p /opt/conda \
    && rm Miniforge3-Linux-$(uname -m).sh

# Add conda to path
ENV PATH="/opt/conda/bin:${PATH}"

# Create and activate environment with sage
RUN conda create -n sage -c conda-forge sage python=3.9 -y \
    && conda run -n sage pip install --no-cache-dir numpy sympy \
    && echo "conda activate sage" >> ~/.bashrc

# Set the default shell to bash and ensure conda environment is activated
SHELL ["/bin/bash", "--login", "-c"]

# Set the default conda environment
ENV CONDA_DEFAULT_ENV=sage
ENV CONDA_PREFIX=/opt/conda/envs/sage
ENV PATH="/opt/conda/envs/sage/bin:${PATH}"

# Keep the container alive during Inspect's evaluation
CMD ["tail", "-f", "/dev/null"]
49 changes: 49 additions & 0 deletions llm_evals/README.md
@@ -0,0 +1,49 @@
# LLM Evaluations for Combinatorics Datasets

This directory contains evaluation scripts for testing language models on the algebraic combinatorics datasets using the inspect-ai framework. The evaluations consist of an in-context learning task and a program synthesis task.

## Overview

The evaluation suite provides two main tasks:

1. **In-Context Classification** (`algcomb_classification`):
   - Tests direct classification, i.e., a simple prompt → label evaluation.
   - Optionally uses few-shot examples (in practice these are necessary for good results) and chain-of-thought prompting.

2. **Program Synthesis** (`algcomb_program_synthesis`):
   - The language model is asked to generate a Python function that solves the classification task.
   - Evaluates the generated code in a sandboxed Docker environment (the model has the option to use numpy, sympy, and Sage; see the Dockerfile).
   - Optionally uses few-shot examples and chain-of-thought prompting.

## Usage Examples

### In-Context Classification
On the weaving patterns dataset, with chain-of-thought reasoning and 25 few-shot examples:
```
inspect eval llm_evaluation.py@algcomb_classification -T dataset=weaving -T n=6 -T few_shot_count=25 -T use_chain_of_thought=true --cache-prompt=true --model=openai/gpt-4
```

### Program Synthesis
On the weaving patterns dataset, with chain-of-thought reasoning and 25 few-shot examples:
```
inspect eval llm_evaluation.py@algcomb_program_synthesis -T dataset=weaving -T n=6 -T few_shot_count=25 -T use_chain_of_thought=true --cache-prompt=true --model=anthropic/claude-2
```

#### Notes:
- You can control the number of generated programs in two ways: by varying the number of epochs or by varying the number of test samples. Either way, the solver evaluates each generated program on the entire test set (this is cheap, since it is just a Python program). The difference is that varying the test samples prompts the model with a different test sample for each program call. However, the score is a max over epochs, so the score displayed by `inspect view --log-dir logs` in your command line is only correct if you run with `max_test_samples=1` (the default) and vary the number of epochs. When running with multiple test samples, you can recover the correct score by examining the logs directly, either sorting by sample in `inspect view` or parsing the CSV (see the sketch below). Hopefully this will be fixed in the future.
- For the o1 series of models, you will probably want to increase `max_tokens`.
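
For the multiple-test-sample case, something like the following can recover the best score across all samples and epochs. This is a rough sketch (not part of this PR): it assumes inspect-ai's log-reading helpers (`list_eval_logs`, `read_eval_log`) and a numeric per-sample score value, so adjust it to the scorer actually used in `llm_evaluation.py`.

```
# Rough sketch: best score over all (sample, epoch) pairs in a log directory.
# Assumes the scorer records a numeric value (e.g., accuracy on the test set).
from inspect_ai.log import list_eval_logs, read_eval_log

def best_program_score(log_dir: str) -> float:
    best = 0.0
    for info in list_eval_logs(log_dir):
        log = read_eval_log(info)
        for sample in log.samples or []:
            for score in (sample.scores or {}).values():
                if isinstance(score.value, (int, float)):
                    best = max(best, float(score.value))
    return best

print(best_program_score("logs"))
```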

## Task Options

- Few-shot learning support (configurable number of training examples)
- Chain-of-thought prompting
- Sample size for testing
- (Very high-level) dataset information included in the prompts
- Provider-level caching (very useful when using a large number of in-context examples).
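
These options map directly onto the task parameters. A minimal programmatic sketch (parameter names taken from `experiment_runner.py` in this PR, shown here for illustration):

```
# Sketch: passing the task options programmatically instead of via -T flags.
from inspect_ai import eval
from llm_evaluation import algcomb_classification

task = algcomb_classification(
    dataset="weaving",
    n=6,
    few_shot_count=25,
    use_chain_of_thought=True,
    include_dataset_info=True,
    max_samples=200,
)
eval(task, model="openai/gpt-4o-mini", log_dir="logs")
```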

## Requirements

- inspect-ai (https://inspect.ai-safety-institute.org.uk/)
- Docker (for the program synthesis evaluation; see the optional build check below)
- API keys for LLM providers (e.g., OpenAI, Anthropic)
- The downloaded datasets (see `load_datasets.py` and `how_to_load_datasets.ipynb`)
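
Before running the program synthesis task, you can optionally build the sandbox image up front to catch Docker problems early (run from the `llm_evals/` directory):

```
docker compose build
```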
8 changes: 8 additions & 0 deletions llm_evals/compose.yaml
@@ -0,0 +1,8 @@
services:
  default:
    build: .
    init: true
    command: tail -f /dev/null
    network_mode: none
    cpus: 1.0
    mem_limit: 3gb
88 changes: 88 additions & 0 deletions llm_evals/experiment_runner.py
@@ -0,0 +1,88 @@
"""
Experiment runner for tasks defined in llm_evaluation.py.
"""

from inspect_ai import eval
from llm_evaluation import algcomb_classification, algcomb_program_synthesis
import os
from plot_per_dataset_n import parse_inspect_logs_from_dir


# Define your configuration values below:
dataset_n_values = {
    "schubert": [3, 4, 5],
    # "weaving": [6, 7],
}
few_shot_counts = (0, 10, 30, 50, 100)
models = ["openai/gpt-4o-mini", "openai/gpt-4o", "anthropic/claude-3-5-sonnet-latest", "openai/o1-mini"]
# solver = "algcomb_classification"
solver = "algcomb_program_synthesis"
use_chain_of_thought = [True, False]

tasks_to_eval = []

def was_evaluated(
    dataset: str,
    n_val: int,
    few_shot: int,
    model: str,
    chain_of_thought: bool,
    log_dir: str
) -> bool:
    """
    Returns True if logs in log_dir contain a record matching the given
    (dataset, n, few_shot, model, use_chain_of_thought).
    """
    if not os.path.isdir(log_dir):
        return False

    df = parse_inspect_logs_from_dir(log_dir)
    subset = df[
        (df["dataset"] == dataset)
        & (df["n"] == n_val)
        & (df["few_shot"] == few_shot)
        & (df["model"] == model)
        & (df["use_chain_of_thought"] == chain_of_thought)
    ]
    return not subset.empty

# Skip configurations that already have a matching record in the per-dataset log directory.
skip_if_existing = True
log_dir_base = "schubert_logs_algcomb_program_synthesis/"  # note: logs are actually written to f"{dataset}_logs_{solver}"

for dataset, n_list in dataset_n_values.items():
    for model in models:
        # Build a fresh task list for this (dataset, model) pair so earlier
        # configurations are not re-evaluated.
        tasks_to_eval = []
        for n in n_list:
            for few_shot in few_shot_counts:
                for cot in use_chain_of_thought:
                    if skip_if_existing:
                        if was_evaluated(dataset, n, few_shot, model, cot, log_dir=f"{dataset}_logs_{solver}"):
                            print(f"Skipping {dataset} (n={n}, few_shot={few_shot}, cot={cot}) for {model}.")
                            continue

                    if solver == "algcomb_classification":
                        t = algcomb_classification(
                            dataset=dataset,
                            n=n,
                            few_shot_count=few_shot,
                            include_dataset_info=True,
                            max_samples=200,
                            use_chain_of_thought=cot
                        )
                    elif solver == "algcomb_program_synthesis":
                        t = algcomb_program_synthesis(
                            dataset=dataset,
                            n=n,
                            few_shot_count=few_shot,
                            include_dataset_info=True,
                            use_chain_of_thought=cot,
                            epochs=100,
                            max_tokens=15000  # this needs to be high for the o1 models because they use many tokens on this task
                        )
                    else:
                        raise ValueError(f"Solver '{solver}' not recognized.")

                    tasks_to_eval.append(t)

        logs = eval(tasks_to_eval, model=model, log_dir=f"{dataset}_logs_{solver}", log_level="warning")

print("Evaluation complete. Logs returned:", logs)