Adding all data and code for experiments #15

Open · wants to merge 5 commits into base: main

7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,8 +1,15 @@
# Changelog


All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added

- Added data: queries used for experiments, as well as human and autorater judgments.
- Added code for running all experiments in the paper.

86 changes: 86 additions & 0 deletions contexteval/README.md
@@ -0,0 +1,86 @@
## *Contextualized Evaluations*: Taking the Guesswork Out of Language Model Evaluations


## Data Overview

* The queries used for our main experiments are sampled from 5 datasets and can be found at `data/all_data_latest_filtered.jsonl`.

* The autorater judgements for all three model pairs are available at `data/autorater_judgements`.

* The human judgements for all three model pairs are available at `data/human_judgements`.
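
For quick inspection, the data files can be loaded with standard JSON tooling. A minimal sketch, assuming each line of the query file is a JSON object with `query` and `source` keys mirroring the `Example` dataclass in `common/example_utils.py`:

```
import collections
import json

# Count queries per source dataset in the filtered query file.
# Assumes each line carries "query" and "source" keys, as in the
# Example dataclass defined in common/example_utils.py.
counts = collections.Counter()
with open("data/all_data_latest_filtered.jsonl", "r") as f:
    for line in f:
        example = json.loads(line)
        counts[example.get("source", "unknown")] += 1

print(counts)
```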


## Code Overview

#### Data Downloads
1. Download all the queries from all 5 datasets (`ChatBot Arena`, `MTBench`, `AlpacaEval`, `ExpertQA`, `KIWI`).
* `python3 main/download_data.py`
2. Filter only well-formed queries and sample a fixed number of queries from each dataset.
* `python3 main/filter_queries.py`


#### Context Generation

Example scripts for running context generation are at `bash_scripts/run_context_generation.sh`.

1. Generate contexts for all queries.
* `python3 main/generate_contexts.py`

2. Generate validation labels for generated contexts.
* `python3 main/generate_context_validation.py`

3. Generate a single instance of context for each query (a follow-up question with a single answer).
* `python3 main/generate_single_context.py`


#### Response Generation

* Example scripts for running response generation are at `bash_scripts/run_response_generation.sh`.

* Generate model responses with and without context.
* `python3 main/generate_responses.py`. Use `--w_context=False` for context-agnostic generation and `--w_context=True` for context-aware generation.
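
A minimal sketch of driving both modes from Python; only the `--w_context` flag mentioned above is shown, and the script very likely takes additional arguments (model name, input/output paths) that are omitted here:

```
import subprocess

# Run response generation once without context (context-agnostic) and
# once with context (context-aware). Additional flags are omitted.
for w_context in ("False", "True"):
    subprocess.run(
        ["python3", "main/generate_responses.py", f"--w_context={w_context}"],
        check=True,
    )
```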

#### Autorater Evaluation Judgements

* Example scripts for generating pairwise evaluation judgements are at `bash_scripts/run_eval_generation.sh`.
* The `CtxGen-CtxEval` setting takes responses that were generated with context, while `NoCtxGen-NoCtxEval` and `NoCtxGen-CtxEval` take responses that were generated without context. Set `W_CONTEXT=True` for `CtxGen-CtxEval` and `NoCtxGen-CtxEval`, and `W_CONTEXT=False` for `NoCtxGen-NoCtxEval` (the three settings are summarized in the sketch below).

* Generate pairwise evaluation judgements using the following script.
* `python3 main/generate_pairwise_evals.py`
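
The relationship between the three settings, the responses they consume, and `W_CONTEXT` can be summarized as follows. This is only an illustrative sketch; the dict is not a configuration format read by any script in the repository:

```
# Illustrative summary of the three evaluation settings described above.
EVAL_SETTINGS = {
    # setting name: which responses to provide, and the W_CONTEXT value
    "CtxGen-CtxEval": {"responses_generated_with_context": True, "W_CONTEXT": True},
    "NoCtxGen-CtxEval": {"responses_generated_with_context": False, "W_CONTEXT": True},
    "NoCtxGen-NoCtxEval": {"responses_generated_with_context": False, "W_CONTEXT": False},
}
```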

#### Win Rate and Agreement Calculation

* Compute win rates and agreement based on autorater judgments.
* `python3 main/compute_autorater_agreement.py`

* Compute win rates and agreement based on human judgments.
* `python3 main/compute_human_agreement.py`
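
As a rough sketch of the quantities these scripts report; the actual implementations may handle ties, aggregation across model pairs, or annotator overlap differently:

```
# Illustrative computation only; compute_autorater_agreement.py and
# compute_human_agreement.py may differ in how they handle ties and aggregation.
def win_rate(judgements, model="A"):
    """Fraction of pairwise judgements won by `model`."""
    wins = sum(1 for j in judgements if j == model)
    return wins / len(judgements)


def percent_agreement(autorater, human):
    """Fraction of items where the autorater and the human pick the same winner."""
    matches = sum(1 for a, h in zip(autorater, human) if a == h)
    return matches / len(human)


print(win_rate(["A", "B", "A", "A"]))             # 0.75
print(percent_agreement(["A", "B"], ["A", "A"]))  # 0.5
```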

#### What Contexts are "Default"?

* Example scripts for running this analysis are at `bash_scripts/default_response_analysis.sh`.

#### Which Contexts are Harder to Follow?

* Example scripts for running this analysis are at `bash_scripts/adapted_response_analysis.sh`.


#### Miscellaneous Scripts

* To generate types of each query based on the degree / type of underspecification, use the script `main/generate_query_types.py`.
* To compute the number of constraints (follow-up QAs) satisfied by each response, use the script `main/eval_num_constraints.py`.
* To codify autorater justifications, use the script `main/code_model_judgements.py` and to codify human justifications, use the script `main/code_human_judgements.py`.


### Citation

```
@article{malaviya2024contexteval,
author = {Malaviya, Chaitanya and Chee Chang, Joseph and Roth, Dan and Iyyer, Mohit and Yatskar, Mark and Lo, Kyle},
title = {Contextualized Evaluations: Taking the Guesswork Out of Language Model Evaluations},
journal = {arXiv preprint arXiv:2411.07237},
month = {November},
year = {2024},
url = "https://arxiv.org/abs/2411.07237"
}
```
43 changes: 43 additions & 0 deletions contexteval/common/example_utils.py
@@ -0,0 +1,43 @@
"""Utilities for reading and writing examples."""

import dataclasses
from typing import Optional, Sequence

import jsonl_utils


@dataclasses.dataclass(frozen=False)
class Example:
    """Represents an example for a task."""

    completed: Optional[bool] = False
    # Input query
    query: Optional[str] = None
    # Query types
    query_types: Optional[str] = None
    # Names of the models from which the outputs were sampled
    model_names: Optional[Sequence[str]] = None
    # Model responses to the input query
    model_responses: Optional[Sequence[str]] = None
    # Whether the query needs to be supplemented with context
    need_for_context: Optional[bool] = False
    # Contexts generated through the model
    contexts: Optional[str] = None
    # Annotator ID of the person who annotated this example
    annotator_id: Optional[str] = None
    # Source of the input query
    source: Optional[str] = None
    # Model from which context was sampled
    context_model_source: Optional[int] = None
    # Sampled QA pairs from context (single answer for each question)
    sampled_context: Optional[str] = None


def read_examples(filepath):
    """Read a jsonl file into a list of Example objects."""
    examples_json = jsonl_utils.read(filepath)
    examples = [Example(**ex) for ex in examples_json]
    return examples


def write_examples(filepath, examples, append=False):
    """Write a list of Example objects to a jsonl file."""
    examples_json = [dataclasses.asdict(ex) for ex in examples]
    jsonl_utils.write(filepath, examples_json, append=append)
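
A short usage sketch, assuming `contexteval/common` is on `PYTHONPATH`, that the JSONL records match the `Example` fields, and that the output path is illustrative:

```
import example_utils

# Load examples and keep only the queries flagged as needing context.
examples = example_utils.read_examples("data/all_data_latest_filtered.jsonl")
needing_context = [ex for ex in examples if ex.need_for_context]

# Write the filtered examples back out (output path is illustrative).
example_utils.write_examples("data/queries_needing_context.jsonl", needing_context)
```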
32 changes: 32 additions & 0 deletions contexteval/common/jsonl_utils.py
@@ -0,0 +1,32 @@
"""Utilities for reading and writing jsonl files."""

import json

def read(filepath, limit=None, verbose=False):
    """Read jsonl file to a List of Dicts."""
    data = []
    with open(filepath, "r") as jsonl_file:
        for idx, line in enumerate(jsonl_file):
            if limit is not None and idx >= limit:
                break
            if verbose and idx % 100 == 0:
                # Print the index every 100 lines.
                print("Processing line %s." % idx)
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print("Failed to parse line: `%s`" % line)
                raise e
    print("Loaded %s lines from %s." % (len(data), filepath))
    return data


def write(filepath, rows, append=False, verbose=True):
    """Write a List of Dicts to jsonl file."""
    open_mode = "a" if append else "w"
    with open(filepath, open_mode) as jsonl_file:
        for row in rows:
            line = "%s\n" % json.dumps(row)
            jsonl_file.write(line)
    if verbose:
        print("Wrote %s lines to %s." % (len(rows), filepath))
78 changes: 78 additions & 0 deletions contexteval/common/tsv_utils.py
@@ -0,0 +1,78 @@
r"""Utilties for reading and writing files.

Expected format for TSV file is that each line has one rows, with each
element separated by \t. The number of element should be the same as
expected_num_columns.

Expected format for rows in memory is a list where each element is:
(element_1, element_2, ...), or [element_1, element_2, ...]
The number of element should be the same as expected_num_columns.

This module also handles the case of writing simple newline-separated txt files.
"""

import csv
import sys
# from tensorflow.io import gfile
csv.field_size_limit(sys.maxsize)


def read_tsv(filepath, delimiter="\t", max_splits=-1):
    """Read file to list of rows."""
    rows = []
    with open(filepath, "r") as tsv_file:
        for line in tsv_file:
            line = line.rstrip()
            cols = line.split(delimiter, max_splits)
            rows.append(cols)
    print("Loaded %s rows from %s." % (len(rows), filepath))
    return rows


def write_tsv(rows, filepath, delimiter="\t"):
    """Write rows to tsv file."""
    with open(filepath, "w") as tsv_file:
        for row in rows:
            line = "%s\n" % delimiter.join([str(elem) for elem in row])
            tsv_file.write(line)
    print("Wrote %s rows to %s." % (len(rows), filepath))


def read_csv(filepath):
    """Read csv file to list of rows."""
    with open(filepath, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(
            csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        rows = list(reader)
    print("Loaded %s rows from %s." % (len(rows), filepath))
    return rows


def write_csv(rows, filepath):
    """Write rows to csv file."""
    print("Writing %d lines to %s" % (len(rows), filepath))
    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(
            csvfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for row in rows:
            writer.writerow(row)


def write_txt(rows, filepath):
    """Write newline separated text file."""
    with open(filepath, "w") as txt_file:
        for row in rows:
            line = "%s\n" % row
            txt_file.write(line)
    print("Wrote %s rows to %s." % (len(rows), filepath))


def read_txt(filepath):
    """Read newline separated text file."""
    rows = []
    with open(filepath, "r") as txt_file:
        for line in txt_file:
            line = line.rstrip()
            rows.append(line)
    print("Loaded %s rows from %s." % (len(rows), filepath))
    return rows
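
A short usage sketch, assuming `contexteval/common` is on `PYTHONPATH`; the file path is illustrative:

```
import tsv_utils

# Write a small table and read it back; read_tsv returns lists of strings.
rows = [("query_id", "judgement"), ("q1", "A"), ("q2", "B")]
tsv_utils.write_tsv(rows, "outputs/judgements.tsv")
loaded = tsv_utils.read_tsv("outputs/judgements.tsv")
assert loaded[1] == ["q1", "A"]
```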