Skip to content

Commit

Permalink
Add a step to assert that the evaluation loss is reasonable
Browse files Browse the repository at this point in the history
  • Loading branch information
mwaskom committed Feb 7, 2024
1 parent d57e2b6 commit e2a56be
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 2 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/ci-cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal pyyaml
pip install modal pyyaml pandas
- name: Prep config and data for CI
run: |
Expand All @@ -39,3 +39,7 @@ jobs:
- name: Run training job on Modal
run: |
modal run src.train --config=config/${{ matrix.config }}.yml --data=data/sqlqa.jsonl
- name: Check training results
run: |
python ci/check_loss.py
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


# Local file written by the training script
.last_run_folder
31 changes: 31 additions & 0 deletions ci/check_loss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from io import StringIO
import re
import sys

import pandas as pd

from modal import Volume


# Upper bound (exclusive) on the final validation loss for the check to pass.
# The value is an arbitrary threshold.
MAX_VALIDATION_LOSS = 0.25


def parse_final_losses(model_card):
    """Return the final training and validation loss from a model card.

    The model card is expected to contain a "### Training results" heading
    followed by a blank line and a pipe-delimited table with
    "Training Loss" and "Validation Loss" columns. The values in the last
    row of that table are returned.

    Args:
        model_card: Full text of the model card (README.md).

    Returns:
        A ``(train_loss, val_loss)`` tuple of floats.

    Raises:
        ValueError: If the results table cannot be located or its last row
            cannot be converted to numbers.
    """
    m = re.search(r"### Training results\n\n(.+?)#", model_card, flags=re.DOTALL)
    if m is None:
        raise ValueError("Could not parse training results from model card")

    # Only the captured group is the table itself; the full match also
    # contains the section heading and the "#" that starts the next section,
    # which would be mistaken for the table's header row.
    table_text = m.group(1).strip().replace(" ", "")
    results = pd.read_table(StringIO(table_text), sep="|")

    try:
        # float() accepts both strings and numeric scalars, so it works
        # whether or not pandas inferred a numeric dtype for the column.
        train_loss = float(results["TrainingLoss"].iloc[-1])
        val_loss = float(results["ValidationLoss"].iloc[-1])
    except (KeyError, IndexError, ValueError) as err:
        raise ValueError("Could not parse training results from model card") from err

    return train_loss, val_loss


if __name__ == "__main__":

    # The training script records which run folder it wrote to on the volume.
    with open(".last_run_folder", "r") as f:
        run_folder = f.read().strip()

    vol = Volume.lookup("example-runs-vol")
    contents = b"".join(vol.read_file(f"{run_folder}/lora-out/README.md"))

    try:
        train_loss, val_loss = parse_final_losses(contents.decode())
    except ValueError as err:
        sys.exit(str(err))

    print(f"Loss: {train_loss:.2f} (training), {val_loss:.2f} (validation)")

    # Exit non-zero only when the loss is NOT below the threshold. Written as
    # a negated "<" so that a NaN loss also fails the check.
    if not val_loss < MAX_VALIDATION_LOSS:
        sys.exit(
            f"Validation loss {val_loss:.2f} is not below {MAX_VALIDATION_LOSS:.2f}"
        )
6 changes: 5 additions & 1 deletion src/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,11 @@ def main(
):
# Read config.yml and my_data.jsonl and pass them to the new function.
with open(config, "r") as cfg, open(data, "r") as dat:
_, train_handle = launch.remote(cfg.read(), dat.read())
run_folder, train_handle = launch.remote(cfg.read(), dat.read())

# Write a local reference to the run's location on the remote volume
with open(".last_run_folder", "w") as f:
f.write(run_folder)

# Wait for the training run to finish.
merge_handle = train_handle.get()
Expand Down

0 comments on commit e2a56be

Please sign in to comment.