diff --git a/llm_evals/Dockerfile b/llm_evals/Dockerfile new file mode 100644 index 0000000..47ee02e --- /dev/null +++ b/llm_evals/Dockerfile @@ -0,0 +1,33 @@ +# Dockerfile for a custom image that includes numpy, sympy, and sage +FROM python:3.12-bookworm + +# Install any packages you need +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Miniforge (recommended for SageMath installation) +RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-$(uname -m).sh" \ + && bash Miniforge3-Linux-$(uname -m).sh -b -p /opt/conda \ + && rm Miniforge3-Linux-$(uname -m).sh + +# Add conda to path +ENV PATH="/opt/conda/bin:${PATH}" + +# Create and activate environment with sage +RUN conda create -n sage -c conda-forge sage python=3.9 -y \ + && conda run -n sage pip install --no-cache-dir numpy sympy \ + && echo "conda activate sage" >> ~/.bashrc + +# Set the default shell to bash and ensure conda environment is activated +SHELL ["/bin/bash", "--login", "-c"] + +# Set the default conda environment +ENV CONDA_DEFAULT_ENV=sage +ENV CONDA_PREFIX=/opt/conda/envs/sage +ENV PATH="/opt/conda/envs/sage/bin:${PATH}" + +# Keep the container alive during Inspect's evaluation +CMD ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/llm_evals/README.md b/llm_evals/README.md new file mode 100644 index 0000000..0c3677e --- /dev/null +++ b/llm_evals/README.md @@ -0,0 +1,49 @@ +# LLM Evaluations for Combinatorics Datasets + +This directory contains evaluation scripts for testing language models on the algebraic combinatorics datasets using the inspect-ai framework. The evaluations consist of an in-context learning task and a program synthesis task. + +## Overview + +The evaluation suite provides two main tasks: + +1. **In-Context Classification** (`algcomb_classification`): + - Tests direct classification, ie simple prompt → label evaluation. + - Optionally uses few-shot examples (pretty necessary to get good results) and chain-of-thought. + +2. **Program Synthesis** (`algcomb_program_synthesis`): + - The language model is given the task of generating a Python function to solve the classification task. + - Evaluates generated code in a sandboxed Docker environment (gives the LM the option to use numpy, sympy, and sage-- see Dockerfile). + - Optionally uses few-shot examples and chain-of-thought. + +## Usage Examples + +### In-Context Classification +On the weaving patterns dataset, with chain-of-thought reasoning and 25 few-shot examples: +``` +inspect eval llm_evaluation.py@algcomb_classification -T --dataset=weaving -T --n=6 -T --few_shot_count=25 -T --use_chain_of_thought=true --cache-prompt=true --model=openai/gpt-4 +``` + +### Program Synthesis +On the weaving patterns dataset, with chain-of-thought reasoning and 25 few-shot examples: +``` +inspect eval llm_evaluation.py@algcomb_program_synthesis -T --dataset=weaving -T --n=6 -T --model=anthropic/claude-2 --cache-prompt=true --few_shot_count=25 --use_chain_of_thought=true +``` + +#### Notes: +- You can control the number of programs generated in two ways: either by varying the number of epochs or by varying the number of test samples. Either way, the solver will evaluate the program on the entire test set (this is cheap because it is just a python program!). The difference is that varying the test samples will prompt the model with a different test sample for each program call. 
However, the scoring is a max over the epochs, so the scoring displayed when you run `inspect view --log-dir logs` in your command line will only be correct if you run with max_samples=1 (the default) and vary the number of epochs. You can recover the correct scoring when you run with multiple test samples by examining the logs (either by sorting by sample in `inspect view` or by parsing the CSV). Hopefully, this will be fixed in the future.
+- For the o1-series of models, you will probably want to increase max_tokens.
+
+## Task Options
+
+- Few-shot learning support (configurable number of training examples)
+- Chain-of-thought prompting
+- Sample size for testing
+- (Very high-level) dataset information included in the prompts
+- Provider-level caching (very useful when using a large number of in-context examples).
+
+## Requirements
+
+- inspect-ai (https://inspect.ai-safety-institute.org.uk/)
+- Docker (for program synthesis evaluation)
+- Access keys for LLM providers (e.g., OpenAI, Anthropic)
+- The downloaded datasets (see load_datasets.py and how_to_load_datasets.ipynb)
\ No newline at end of file
diff --git a/llm_evals/compose.yaml b/llm_evals/compose.yaml
new file mode 100644
index 0000000..3d01783
--- /dev/null
+++ b/llm_evals/compose.yaml
@@ -0,0 +1,8 @@
+services:
+  default:
+    build: .
+    init: true
+    command: tail -f /dev/null
+    network_mode: none
+    cpus: 1.0
+    mem_limit: 3gb
\ No newline at end of file
diff --git a/llm_evals/experiment_runner.py b/llm_evals/experiment_runner.py
new file mode 100644
index 0000000..9d1d3ce
--- /dev/null
+++ b/llm_evals/experiment_runner.py
@@ -0,0 +1,88 @@
+"""
+Experiment runner for tasks defined in llm_evaluation.py.
+"""
+
+from inspect_ai import eval
+from llm_evaluation import algcomb_classification, algcomb_program_synthesis
+import os
+from plot_per_dataset_n import parse_inspect_logs_from_dir
+
+
+# Define your configuration values below:
+dataset_n_values = {
+    "schubert": [3, 4, 5],
+    # "weaving": [6, 7],
+}
+few_shot_counts = (0, 10, 30, 50, 100)
+models = ["openai/gpt-4o-mini", "openai/gpt-4o", "anthropic/claude-3-5-sonnet-latest", "openai/o1-mini"]
+# solver = "algcomb_classification"
+solver = "algcomb_program_synthesis"
+use_chain_of_thought = [True, False]
+
+tasks_to_eval = []
+
+def was_evaluated(
+    dataset: str,
+    n_val: int,
+    few_shot: int,
+    model: str,
+    chain_of_thought: bool,
+    log_dir: str
+) -> bool:
+    """
+    Returns True if logs in log_dir contain a record matching the given
+    (dataset, n, few_shot, model, use_chain_of_thought).
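+
+    Args:
+        dataset, n_val, few_shot, model, chain_of_thought: Configuration values to look for.
+        log_dir: Directory of Inspect logs, parsed with parse_inspect_logs_from_dir().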
+ """ + if not os.path.isdir(log_dir): + return False + + df = parse_inspect_logs_from_dir(log_dir) + subset = df[ + (df["dataset"] == dataset) + & (df["n"] == n_val) + & (df["few_shot"] == few_shot) + & (df["model"] == model) + & (df["use_chain_of_thought"] == chain_of_thought) + ] + return not subset.empty + +skip_if_existing = True +log_dir_base = "schubert_logs_algcomb_program_synthesis/" + +for dataset, n_list in dataset_n_values.items(): + for model in models: + for n in n_list: + for few_shot in few_shot_counts: + for cot in use_chain_of_thought: + if skip_if_existing: + if was_evaluated(dataset, n, few_shot, model, cot, log_dir=f"{dataset}_logs_{solver}"): + print("Skipping this configuration.") + continue + + if solver == "algcomb_classification": + t = algcomb_classification( + dataset=dataset, + n=n, + few_shot_count=few_shot, + include_dataset_info=True, + max_samples=200, + use_chain_of_thought=cot + ) + elif solver == "algcomb_program_synthesis": + t = algcomb_program_synthesis( + dataset=dataset, + n=n, + few_shot_count=few_shot, + include_dataset_info=True, + use_chain_of_thought=cot, + epochs=100, + max_tokens=15000 # this need to be high for the o1 models because they use a lot of tokens for this task + ) + else: + raise ValueError(f"Solver '{solver}' not recognized.") + + tasks_to_eval.append(t) + + logs = eval(tasks_to_eval, model=model, log_dir=f"{dataset}_logs_{solver}", log_level="warning") + +print("Evaluation complete. Logs returned:", logs) \ No newline at end of file diff --git a/llm_evals/llm_evaluation.py b/llm_evals/llm_evaluation.py new file mode 100644 index 0000000..349f4f7 --- /dev/null +++ b/llm_evals/llm_evaluation.py @@ -0,0 +1,348 @@ +""" +Inspect script that includes tasks for ALL of the datasets described in how_to_load_datasets.ipynb +and load_datasets.py. Each task loads the specified dataset (train/test), then constructs +a small, human-readable Inspect dataset from the test set and evaluates it via a +simple prompt → label approach, with provider-level caching turned on. + +Now, each task includes optional arguments: + - few_shot_count (int): If > 0, uses that many few-shot examples from training data. + - use_chain_of_thought (bool): If True, inserts a meta-instruction requesting step-by-step reasoning. + - max_samples (int): If > 0, uses that many samples from the test set. Otherwise, uses all of the test set. + - include_dataset_info (bool): If True, includes dataset information in the prompt. 
This is very high-level (see load_datasets.py) + +Usage example for in-context classification, using both few-shot and chain-of-thought: + inspect eval llm_evaluation.py@algcomb_classification -T --dataset=weaving -T --n=6 -T --few_shot_count=25 -T --use_chain_of_thought=true --cache-prompt=true --model=openai/gpt-4 + +Usage example for program synthesis, using both few-shot and chain-of-thought: + inspect eval llm_evaluation.py@algcomb_program_synthesis -T --dataset=weaving -T --n=6 -T --model=anthropic/claude-2 --cache-prompt=true --few_shot_count=25 --use_chain_of_thought=true +""" + + +import json +import sys +sys.path.append('..') +from load_datasets import get_dataset + +from inspect_ai.scorer import scorer +from random import sample + +from inspect_ai import Epochs, Task, task +from inspect_ai.dataset import MemoryDataset, Sample +from inspect_ai.solver import TaskState, chain, prompt_template, generate, chain_of_thought, system_message +from inspect_ai.scorer import Score, Target, match, mean, stderr + +def build_inspect_dataset(X, y, max_samples=-1): + """ + Converts arrays X (features) and y (labels) into an Inspect MemoryDataset. + Each item becomes a Sample with 'input' and 'target'. + + max_samples limits how many samples to demonstrate. + For more details, see: + https://inspect.ai-safety-institute.org.uk/datasets.html + """ + samples = [] + if max_samples == -1: + max_samples = len(X) + N = min(len(X), max_samples) + for i in range(N): + xi_str = " ".join(map(str, X[i])) + yi_str = str(y[i]) + + prompt_text = ( + "\n\nHere is an input sequence:\n" + f"{xi_str}\n" + "Which label best applies?" + ) + + samples.append( + Sample( + input=prompt_text, + target=yi_str + ) + ) + return MemoryDataset(samples) + + +def build_solver_chain( + few_shot_examples_str: str, + use_chain_thought: bool +): + """ + Helper function to build a solver chain with optional chain-of-thought. + few_shot_examples_str is either "" (empty) or a few-shot prompt block. + use_chain_thought indicates if chain_of_thought() should be inserted. + """ + if few_shot_examples_str: + prompt_str = ( + f"{few_shot_examples_str}\n" + ) + else: + # no few-shot examples + prompt_str = "" + + steps = [prompt_template(prompt_str)] + if use_chain_thought: + steps.append(chain_of_thought()) + else: + prompt_str = "{prompt} Do not provide any additional reasoning or explanation. Just provide your answer at the end on its own line in the form 'ANSWER: $ANSWER' (without quotes) where $ANSWER is the answer to the question." + steps.append(prompt_template(prompt_str)) + + steps.append(generate(config={"cache-prompt": True})) + return chain(*steps) + + +def build_few_shot_examples_str(X_train, y_train, few_shot_count: int): + """ + Builds a few-shot examples string from the train set. + """ + if few_shot_count <= 0: + return "" + selected_indices = sample(range(len(X_train)), min(few_shot_count, len(X_train))) + examples_str_list = [] + for idx in selected_indices: + inp_str = " ".join(str(val) for val in X_train[idx]) + out_str = str(y_train[idx]) + examples_str_list.append(f"Example:\nInput: {inp_str}\nOutput: {out_str}\n") + return "\n".join(examples_str_list) + + +@task +def algcomb_classification( + dataset: str = "schubert", + n: int = 6, + folder: str = "../", + few_shot_count: int = 0, + max_samples: int = -1, + use_chain_of_thought: bool = False, + include_dataset_info: bool = True +): + """ + A single task that evaluates a LM on one of the ML4AlgComb datasets. 
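+
+    The test set is converted into an Inspect MemoryDataset (see build_inspect_dataset)
+    and each model response is scored with match() against the target label.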
+
+    Args:
+        dataset (str): Must be either "weaving", "rsk", "schubert", "quiver", "mheight", "symmetric_group_char", "grassmannian_cluster_algebras", "kl_polynomial", or "lattice_path"
+        n (int):
+            - n = 6, 7, or 8 for "weaving"
+            - n = 8, 9, or 10 for "rsk"
+            - n = 3, 4, 5, or 6 for "schubert"
+            - n = 10, 11, or 12 for "mheight"
+            - n = 18, 20, 22 for "symmetric_group_char"
+            - n = 8 or 9 for "kl_polynomial"
+            - n = 10, 11, 12, or 13 for "lattice_path"
+            - There are not multiple values of n for the "quiver" and "grassmannian_cluster_algebras" datasets
+        folder (str): Path to the dataset files.
+        few_shot_count (int): Number of few-shot training examples (0 = none).
+        use_chain_of_thought (bool): Use chain-of-thought meta-prompt.
+        max_samples (int): Number of test samples to evaluate (-1 = all).
+        include_dataset_info (bool): Include dataset information in the prompt.
+    """
+    allowed_datasets = [
+        "weaving",
+        # "kl_polynomial",
+        "schubert",
+        "rsk",
+        "lattice_path",
+        "mheight",
+        "quiver",
+        "symmetric_group_char",
+    ]
+    if dataset not in allowed_datasets:
+        raise ValueError(
+            f"Invalid dataset: '{dataset}'. Must be one of: {', '.join(allowed_datasets)}"
+        )
+
+    result = get_dataset(data=dataset, n=n, folder=folder, info_str=include_dataset_info)
+    if include_dataset_info:
+        (X_train, y_train, X_test, y_test, input_size, output_size, num_tokens), dataset_info = result
+    else:
+        (X_train, y_train, X_test, y_test, input_size, output_size, num_tokens) = result
+        dataset_info = ""
+
+    ds = build_inspect_dataset(X_test, y_test, max_samples=max_samples)
+
+    few_shot_str = build_few_shot_examples_str(X_train, y_train, few_shot_count)
+    prompt_with_info = (
+        (f"You are tasked with solving a classification problem. Here is high-level information about the dataset:\n{dataset_info}\n\n" if dataset_info else "")
+        + f"{few_shot_str}"
+        "{prompt}"
+    )
+    solver_chain = build_solver_chain(prompt_with_info, use_chain_of_thought)
+
+    return Task(dataset=ds, solver=solver_chain, scorer=match())
+
+
+@task
+def algcomb_program_synthesis(
+    dataset: str = "schubert",
+    n: int = 6,
+    folder: str = "../",
+    few_shot_count: int = 0,
+    use_chain_of_thought: bool = False,
+    max_samples: int = 1,
+    max_test_samples: int = -1,
+    epochs: int = 1,
+    timeout: int = 30,
+    max_tokens: int = 10000,
+    include_dataset_info: bool = True,
+):
+    """
+    Program synthesis solver that works for all ML4AlgComb datasets.
+    In each epoch, the model attempts to generate a single Python function
+    that solves the classification problem. We then score the program by
+    running it within a sandboxed python() environment on the test samples.
+
+    See the README for more details. Requires Docker and the Dockerfile in the same directory as this script.
+
+    Args:
+        dataset (str): Must be one of: "weaving", "rsk", "schubert", "quiver",
+            "mheight", "symmetric_group_char", "grassmannian_cluster_algebras",
+            "kl_polynomial", or "lattice_path"
+        n (int): Dataset size parameter (varies per dataset).
+        folder (str): Location of dataset files.
+        few_shot_count (int): If >0, number of training examples to show inline.
+        use_chain_of_thought (bool): Whether or not to request chain-of-thought.
+        max_samples (int): Number of Inspect samples to create; each sample yields one generated program per epoch (defaults to 1).
+        epochs (int): Number of times to re-run the generation process.
+        timeout (int): Timeout (in seconds) for the sandboxed program call.
+        include_dataset_info (bool): Whether to include dataset information in the prompt.
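+        max_test_samples (int): If > 0, truncate the test set to this many examples before scoring; -1 (the default) uses the full test set.
+        max_tokens (int): Token limit passed to generate(); the o1-series models typically need a larger value (see the README).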
+ """ + allowed_datasets = [ + "kl_polynomial", + "schubert", + "rsk", + "lattice_path", + "mheight", + "quiver", + "symmetric_group_char", + "weaving" + ] + if dataset not in allowed_datasets: + raise ValueError( + f"Invalid dataset: '{dataset}'. Must be one of: {', '.join(allowed_datasets)}" + ) + + result = get_dataset(data=dataset, n=n, folder=folder, info_str=include_dataset_info) + if include_dataset_info: + (X_train, y_train, X_test, y_test, input_size, output_size, num_tokens), dataset_info = result + else: + (X_train, y_train, X_test, y_test, input_size, output_size, num_tokens) = result + dataset_info = "" + + if max_test_samples > 0: + X_test = X_test[:max_test_samples] + y_test = y_test[:max_test_samples] + + training_examples = "" + few_shot_indices = range(min(few_shot_count, len(X_train))) + for i in few_shot_indices: + training_examples += ( + f"\n# Input: {X_train[i]}, Expected Output: {y_train[i]}" + ) + + instructions = ( + "Before answering with your Python code, reason in a step-by-step manner as to get the right answer.\n\n" + if use_chain_of_thought + else "" + ) + + system_msg = f"""Your job is to write a Python function that solves the classification problem. +You will be given some examples of a classification problem from the '{dataset}' dataset. +Write a function 'predict' that takes an input in a Python list and returns an integer as the classification result. + +Here is information about the dataset: +{dataset_info} + +Avoid using machine learning or model calls; rather, embed the logic in Python code. +Rather than use shallow pattern matching or using simple patterns, try to analyze the underlying combinatorial logic of the examples. Note that the datagenerating process for this dataset is a combinatorial algorithm. +You may want to use numpy and sympy for math operations or sage for cominatorics, however this is optional. If you do use them, *make sure to import them within your function*. + +Below are a few examples from the training set: +{training_examples} + +{instructions} +Your final answer should be valid Python code enclosed in triple backticks. This program will be evaluated on the test set. +""" + + + @scorer(metrics=[mean(), stderr()]) + def program_synthesis_scorer(): + async def score(state: TaskState, target: Target) -> Score: + import re + + completion = state.output.completion or "" + pattern = r"```(?:python)?\n(.*?)```" + match = re.search(pattern, completion, re.DOTALL) + function_body = match.group(1) if match else None + + if not function_body: + # If we can't find code, mark as 0.0 + return Score(value=0.0, explanation="No code block found") + + # Insert a test harness that: + # 1) defines the predict(...) function from user code + # 2) runs predict(...) 
on all X_test + # 3) returns predictions as JSON (so we can parse them in python) + python_test_code = f""" +import json + +{function_body} + +test_inputs = {X_test.tolist()} +preds = [] +for x in test_inputs: + preds.append(predict(x)) + +print(json.dumps(preds)) +""" + + try: + from inspect_ai.tool import ToolError + from inspect_ai.tool import python as py_tool + + result = await py_tool(timeout=timeout)(code=python_test_code) + try: + preds = json.loads(result) + except json.JSONDecodeError: + return Score(value=0.0, explanation=f"Failed to parse JSON output: {result}") + if len(preds) != len(y_test): + return Score( + value=0.0, + explanation="Returned predictions have wrong length", + ) + + correct_count = sum(1 for p, gold in zip(preds, y_test) if p == gold) + acc = correct_count / len(y_test) + return Score( + value=acc, + answer=str(preds), + explanation=f"Ran program in sandbox. Acc={acc:.3f}, {correct_count} correct out of {len(y_test)}" + ) + except ToolError as te: + return Score(value=0.0, explanation=f"ToolError: {te}") + except Exception as ex: + return Score(value=0.0, explanation=f"Exception: {ex}") + + return score + + samples = [] + for x, y in zip(X_train[:max_samples], y_train[:max_samples]): + samples.append(Sample(input="", target=str(y))) + + ds = MemoryDataset(samples) + + solver_chain = [ + system_message(system_msg), + generate( + max_tokens=max_tokens, + stop=["```", "def predict", "# End of code"], + config={"cache-prompt": True} + ), + ] + + return Task( + dataset=ds, + solver=solver_chain, + scorer=program_synthesis_scorer(), + sandbox="docker", + epochs=Epochs(epochs=epochs, reducer="max") + ) diff --git a/llm_evals/plot_per_dataset_n.py b/llm_evals/plot_per_dataset_n.py new file mode 100644 index 0000000..53b7cb0 --- /dev/null +++ b/llm_evals/plot_per_dataset_n.py @@ -0,0 +1,190 @@ +""" +Creates one plot per (dataset, n) pair from Inspect evaluation logs produced by +the tasks in llm_evaluation.py (algcomb_classification, algcomb_program_synthesis). +For each dataset-n pair, the plot shows accuracy vs. few-shot count, colored by model. + +Example Usage: + python plot_per_dataset_n.py --logdir logs_dir + +Where logs_dir is a directory containing one or more Inspect log files (*.json) +generated by, e.g.: + inspect eval experiment_runner.py --json > logs_part1.json + inspect eval llm_evaluation.py@algcomb_classification --json > logs_part2.json +(and so on). Just place them all in logs_dir, and they will be combined. + +This script uses `read_eval_log` from `inspect_ai.log` instead of raw JSON. +""" + +import os +import argparse +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +from inspect_ai.log import read_eval_log + +sns.set_style("whitegrid") + +def parse_inspect_log_file(log_path): + """ + Reads a single Inspect log file via read_eval_log and extracts records for each + task (or single task). Returns a list of dictionaries with: + { + "dataset": dataset_name, + "n": n_value, + "few_shot": few_shot_count, + "model": model, + "accuracy": final_score + } + + We attempt to locate 'accuracy' or 'score' or 'mean' in the log's metrics + to use as the final score. 
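+
+    Returns an empty list if the log has no results.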
+ """ + records = [] + log_file = read_eval_log(log_path) + if not log_file or not log_file.results: + return records + + # Identify the top-level model name, if any, from the log + # (Because a multi-task run might have used the same model across tasks) + top_level_model = getattr(log_file.eval, "model", "unknown_model") + + # If it's a multi-task log, we might have multiple tasks in log_file.results.task_results + # If not multi-task, we might just have log_file.results.metrics, etc. + task_results = getattr(log_file.results, "task_results", None) + + def extract_final_score(metrics_dict): + """Helper to find a final score-like metric (accuracy, score, or mean).""" + for possible_key in ["accuracy", "score", "mean"]: + if possible_key in metrics_dict: + return metrics_dict[possible_key].value + # If none of these are found, return None + return None + + if task_results: + # Multi-task scenario + for tresult in task_results: + params = getattr(tresult, "task_args", {}) + dataset = params.get("dataset", "unknown_dataset") + n_value = params.get("n", -1) + few_shot = params.get("few_shot_count", 0) + # Fall back to top_level_model if not present in tresult + model = getattr(log_file.eval, "model", "unknown_model") + + # Attempt to get final score from task-level metrics + final_score = extract_final_score(tresult.metrics) + # NEW: Capture whether CoT was used + cot = params.get("use_chain_of_thought", False) + + records.append({ + "dataset": dataset, + "n": n_value, + "few_shot": few_shot, + "model": model, + "accuracy": final_score, + "use_chain_of_thought": cot + }) + else: + # Single-task scenario + params = getattr(log_file.eval, "task_args", {}) + dataset = params.get("dataset", "unknown_dataset") + n_value = params.get("n", -1) + few_shot = params.get("few_shot_count", 0) + model = top_level_model + + # Attempt to get final score from top-level metrics + final_score = extract_final_score(log_file.results.metrics) + # NEW: Capture whether CoT was used + cot = params.get("use_chain_of_thought", False) + + records.append({ + "dataset": dataset, + "n": n_value, + "few_shot": few_shot, + "model": model, + "accuracy": final_score, + "use_chain_of_thought": cot + }) + + return records + + +def parse_inspect_logs_from_dir(log_dir): + """ + Reads all .json files in log_dir, parses them using parse_inspect_log_file, + and returns a combined DataFrame with columns: dataset, n, few_shot, model, accuracy. + """ + all_records = [] + for file_name in os.listdir(log_dir): + if file_name.endswith(".json") or file_name.endswith(".eval"): + file_path = os.path.join(log_dir, file_name) + new_records = parse_inspect_log_file(file_path) + all_records.extend(new_records) + + df = pd.DataFrame(all_records) + return df + + +def main(): + """ + Reads Inspect log files in a directory, combines them into a DataFrame, then generates + a per-(dataset, n) line plot of accuracy vs. few-shot count, colored by model. + """ + parser = argparse.ArgumentParser(description="Plots Inspect logs (accuracy vs. 
few_shot) using read_eval_log.") + parser.add_argument("--logdir", type=str, default="logs_dir", help="Directory containing the JSON logs from Inspect.") + parser.add_argument("--task_name", type=str, default="n-shot classification", help="Name of the task to plot.") + args = parser.parse_args() + + # Parse all logs in the directory + df = parse_inspect_logs_from_dir(args.logdir) + + # If no logs found, exit gracefully + if df.empty: + print(f"No .json logs found in '{args.logdir}' or no metrics data extracted.") + return + + # Sort by few_shot for plotting convenience + df = df.sort_values("few_shot") + + # For each dataset, and each value of n, create and save a plot + for dataset in df["dataset"].unique(): + ds_subset = df[df["dataset"] == dataset] + for n_val in ds_subset["n"].unique(): + subset = ds_subset[ds_subset["n"] == n_val] + + if subset.empty: + continue + + # Group by few_shot and model, and get the max accuracy for each group + subset = subset.groupby(['few_shot', 'model', 'use_chain_of_thought'])['accuracy'].max().reset_index() + + # Create the plot + plt.figure(figsize=(6, 4)) + sns.lineplot( + data=subset, + x="few_shot", + y="accuracy", + hue="model", + style="use_chain_of_thought", + markers=True, + dashes=False, + errorbar=None + ) + plt.title(f"Dataset = {dataset}, n = {n_val}, task name = {args.task_name}") + plt.xlabel("Few-Shot Examples") + plt.ylabel("Accuracy") + plt.ylim(0, 1) # classification accuracies or normalized scores typically in [0, 1] + plt.legend(loc="best") + plt.tight_layout() + + # Save as PNG + out_filename = f"{dataset}_n={n_val}_task_name={args.task_name}.png" + plt.savefig(out_filename) + plt.close() + print(f"Saved plot: {out_filename}") + + print("Done! Plots created for each (dataset, n) pair.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/load_datasets.py b/load_datasets.py index b0747bc..5875966 100644 --- a/load_datasets.py +++ b/load_datasets.py @@ -1,13 +1,11 @@ -import torch import os import random -import pickle as pkl import numpy as np import math import ast from typing import Optional -def get_dataset(data: str, n: Optional[int] = None, folder = "./"): +def get_dataset(data: str, n: Optional[int] = None, folder = "./", info_str: bool = False): """ Parameters: ---------- @@ -22,10 +20,12 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): - n = 10, 11, 12, or 13 for "lattice_path" - There are not multiple values of n for the "quiver" and "grassmannian_cluster_algebras" datasetes folder (str, optional): Base directory for dataset files. Defaults to "./". + info_str (bool, optional): Also return a string with information about the dataset. Defaults to False. Returns: -------- tuple: A tuple containing the following elements: X_train (np.array), y_train (np.array), X_test (np.array), y_test (np.array), input_size (int), output_size (int), num_tokens (int) + info_str (str, optional): A string with information about the dataset. """ if data == "weaving": @@ -54,7 +54,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): print(f"Test set has {len(X_test)} examples") print(f"Inputs are sequences of length {input_size} with entries between 0 and {num_tokens-1}, representing weaving patterns.") print(f"There are {output_size} classes. 
Weaving patterns are labeled 1, non-weaving patterns are labeled 0.")
-        return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens)
+        if info_str:
+            info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInputs are sequences of length {input_size} with entries between 0 and {num_tokens-1}, representing weaving patterns.\nThere are {output_size} classes. Weaving patterns are labeled 1, non-weaving patterns are labeled 0."
+            return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens), info_str
+        else:
+            return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens)

     elif data == "rsk":
         assert n in {8, 9, 10}, f"Can't handle n={n}. n must be 8, 9, or 10."
@@ -88,7 +92,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"):
         print(f"Test set has {len(X_test)} examples")
         print(f"Input sequence is length {max_input_length} with entries 0 through {num_tokens-1}, representing two concatenated SSYT, padded so that all inputs have the same length.")
         print(f"Outputs are binary sequences of length {len(y_train[0])}. Output is one permutation represented by its inversion sequence.")
-        return np.array(X_train_padded), np.array(y_train), np.array(X_test_padded), np.array(y_test), max_input_length, output_size, num_tokens
+        if info_str:
+            info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInput sequence is length {max_input_length} with entries 0 through {num_tokens-1}, representing two concatenated SSYT, padded so that all inputs have the same length.\nOutputs are binary sequences of length {len(y_train[0])}. Output is one permutation represented by its inversion sequence."
+            return (np.array(X_train_padded), np.array(y_train), np.array(X_test_padded), np.array(y_test), max_input_length, output_size, num_tokens), info_str
+        else:
+            return (np.array(X_train_padded), np.array(y_train), np.array(X_test_padded), np.array(y_test), max_input_length, output_size, num_tokens)

     elif data == "schubert":
         assert n in {3, 4, 5, 6}, f"Can't handle n={n}. n must be 3, 4, 5, or 6."
@@ -121,7 +129,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"):
         print(f"Test set has {len(X_test)} examples")
         print(f"Inputs are sequences of length {input_size}, which represent three concatenated permutations on the letters 1 through {num_tokens-1}.")
         print(f"There are {output_size} classes, which give the structure constant for the input permutations.")
-        return (np.array(X_train_flattened), np.array(y_train), np.array(X_test_flattened), np.array(y_test), input_size, output_size, num_tokens)
+        if info_str:
+            info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInputs are sequences of length {input_size}, which represent three concatenated permutations on the letters 1 through {num_tokens-1}.\nThere are {output_size} classes, which give the structure constant for the input permutations."
+ return (np.array(X_train_flattened), np.array(y_train), np.array(X_test_flattened), np.array(y_test), input_size, output_size, num_tokens), info_str + else: + return (np.array(X_train_flattened), np.array(y_train), np.array(X_test_flattened), np.array(y_test), input_size, output_size, num_tokens) elif data == "symmetric_group_char": @@ -156,8 +168,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): print(f"Test set has {len(X_test)} examples") print(f"Inputs are sequences of length {input_size} with entries 0 through {num_tokens-1}, which represent two concatenated integer partitions of n={n}.") print(f"There are {output_size} classes for n={n}.") - - return (X_train.reshape(X_train.shape[0], -1), y_train, X_test.reshape(X_test.shape[0], -1), y_test, input_size, output_size, num_tokens) + if info_str: + info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInputs are sequences of length {input_size} with entries 0 through {num_tokens-1}, which represent two concatenated integer partitions of n={n}.\nThere are {output_size} classes for n={n}." + return (np.array(X_train.reshape(X_train.shape[0], -1)), np.array(y_train), np.array(X_test.reshape(X_test.shape[0], -1)), np.array(y_test), input_size, output_size, num_tokens), info_str + else: + return (np.array(X_train.reshape(X_train.shape[0], -1)), np.array(y_train), np.array(X_test.reshape(X_test.shape[0], -1)), np.array(y_test), input_size, output_size, num_tokens) elif data == "quiver": path_to_files = os.path.join(folder, "./cluster_algebra_quivers/") @@ -180,7 +195,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): print(f"Test set has {len(X_test)} examples") print(f"Input sequences of length {input_size} are flattened adjacency matrices with entries 0 through {num_tokens-1}") print(f"There are {output_size} classes: A_11: 0, BD_11: 1, D_11: 2, BE_11: 3, BB_11: 4, E_11: 5, DE_11: 6") - return (X_train, np.array(y_train), X_test, np.array(y_test), input_size, output_size, num_tokens) + if info_str: + info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInput sequences of length {input_size} are flattened adjacency matrices with entries 0 through {num_tokens-1}\nThere are {output_size} classes: A_11: 0, BD_11: 1, D_11: 2, BE_11: 3, BB_11: 4, E_11: 5, DE_11: 6" + return (np.array(X_train.reshape(X_train.shape[0], -1)), np.array(y_train), np.array(X_test.reshape(X_test.shape[0], -1)), np.array(y_test), input_size, output_size, num_tokens), info_str + else: + return (np.array(X_train.reshape(X_train.shape[0], -1)), np.array(y_train), np.array(X_test.reshape(X_test.shape[0], -1)), np.array(y_test), input_size, output_size, num_tokens) elif data == "mheight": assert n in {8, 9, 10, 11, 12}, f"Can't handle n={n}. n must be 8, 9, 10, 11 or 12." 
@@ -203,7 +222,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): print(f"Test set has {len(X_test)} examples") print(f"Input sequences are permutations represented by their inversion sequence, which is a binary sequence of length ({n} choose 2)= {input_size}.") print(f"There are {output_size} classes; classes that contained less than 0.01% of the data were filtered.") - return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens) + if info_str: + info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInput sequences are permutations represented by their inversion sequence, which is a binary sequence of length ({n} choose 2)= {input_size}.\nThere are {output_size} classes; classes that contained less than 0.01% of the data were filtered." + return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens), info_str + else: + return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens) elif data == "grassmannian_cluster_algebras": @@ -241,7 +264,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): print(f"Test set has {len(X_test)} examples") print(f"Inputs are sequences of length {input_size}, with {num_tokens} tokens, which represent 3x4 SSYT") print(f"There are {output_size} classes. SSYT that index a valid cluster variable are labeled 1 and SSYT that do not are labeled 0.") - return (X_train.reshape(X_train.shape[0], -1), y_train, X_test.reshape(X_test.shape[0], -1), y_test, input_size, output_size, num_tokens) + if info_str: + info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInputs are sequences of length {input_size}, with {num_tokens} tokens, which represent 3x4 SSYT\nThere are {output_size} classes. SSYT that index a valid cluster variable are labeled 1 and SSYT that do not are labeled 0." + return (np.array(X_train.reshape(X_train.shape[0], -1)), np.array(y_train), np.array(X_test.reshape(X_test.shape[0], -1)), np.array(y_test), input_size, output_size, num_tokens), info_str + else: + return (np.array(X_train.reshape(X_train.shape[0], -1)), np.array(y_train), np.array(X_test.reshape(X_test.shape[0], -1)), np.array(y_test), input_size, output_size, num_tokens) elif data == "kl_polynomial": assert n in {8, 9}, f"Can't handle n={n}. n must be 8, 9, or 10." @@ -263,7 +290,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"): print(f"Test set has {len(X_test)} examples") print(f"Inputs are sequences of length {input_size}, representing two permutations on the letters 0 through {num_tokens-1}") print(f"There are {output_size} classes, which each represent the fifth coefficient in the polynomial.") - return (X_train, y_train, X_test, y_test, input_size, output_size, num_tokens) + if info_str: + info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInputs are sequences of length {input_size}, representing two permutations on the letters 0 through {num_tokens-1}\nThere are {output_size} classes, which each represent the fifth coefficient in the polynomial." 
+            return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens), info_str
+        else:
+            return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens)

     elif data == "lattice_path":
         assert n in {10, 11, 12, 13}, f"Can't handle {n}"
@@ -292,8 +323,11 @@ def get_dataset(data: str, n: Optional[int] = None, folder = "./"):
         print(f"Test set has {len(X_test)} examples")
         print(f"Inputs are two concatenated binary sequences represented a lattice path and its cover. The input for n={n} is length {input_size}.")
         print(f"There are {output_size} classes. Lagrange covers are labeled 0, matching covers are labeled 1.")
-
-        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens
+        if info_str:
+            info_str = f"Train set has {len(X_train)} examples\nTest set has {len(X_test)} examples\nInputs are two concatenated binary sequences representing a lattice path and its cover. The input for n={n} is length {input_size}.\nThere are {output_size} classes. Lagrange covers are labeled 0, matching covers are labeled 1."
+            return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens), info_str
+        else:
+            return (np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), input_size, output_size, num_tokens)

     else:
         raise NotImplementedError(f'No {data}. Supported options are "weaving", "rsk", "schubert", "quiver", "mheight", "symmetric_group_char", "grassmannian_cluster_algebras", "kl_polynomial", or "lattice_path".')