From a8fe0370632c943a34a10045c4877657d9b42f7e Mon Sep 17 00:00:00 2001 From: Kaiyu Yang Date: Sat, 30 Mar 2024 23:30:21 +0000 Subject: [PATCH] running --- generator/confs/cli_lean4_novel_premises.yaml | 7 ++++--- generator/confs/cli_lean4_random.yaml | 7 ++++--- generator/model.py | 13 +++++++++---- prover/proof_search.py | 1 - 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/generator/confs/cli_lean4_novel_premises.yaml b/generator/confs/cli_lean4_novel_premises.yaml index bb3f381..f720237 100644 --- a/generator/confs/cli_lean4_novel_premises.yaml +++ b/generator/confs/cli_lean4_novel_premises.yaml @@ -11,7 +11,7 @@ trainer: cpu_checkpointing: false gradient_clip_val: 1.0 max_steps: 500000 - check_val_every_n_epoch: 2 + check_val_every_n_epoch: 1 num_sanity_val_steps: 0 callbacks: - class_path: pytorch_lightning.callbacks.LearningRateMonitor @@ -26,8 +26,9 @@ model: length_penalty: 0.0 ret_ckpt_path: null eval_num_retrieved: 100 - eval_num_cpus: 12 - eval_num_theorems: 0 + eval_num_workers: 8 + eval_num_gpus: 1 + eval_num_theorems: 400 data: data_path: data/leandojo_benchmark_4/novel_premises/ diff --git a/generator/confs/cli_lean4_random.yaml b/generator/confs/cli_lean4_random.yaml index 1d9d066..565184a 100644 --- a/generator/confs/cli_lean4_random.yaml +++ b/generator/confs/cli_lean4_random.yaml @@ -11,7 +11,7 @@ trainer: cpu_checkpointing: false gradient_clip_val: 1.0 max_steps: 500000 - check_val_every_n_epoch: 2 + check_val_every_n_epoch: 1 num_sanity_val_steps: 0 callbacks: - class_path: pytorch_lightning.callbacks.LearningRateMonitor @@ -26,8 +26,9 @@ model: length_penalty: 0.0 ret_ckpt_path: null eval_num_retrieved: 100 - eval_num_cpus: 12 - eval_num_theorems: 0 + eval_num_workers: 8 + eval_num_gpus: 1 + eval_num_theorems: 400 data: data_path: data/leandojo_benchmark_4/random/ diff --git a/generator/model.py b/generator/model.py index a26c985..e4a2983 100644 --- a/generator/model.py +++ b/generator/model.py @@ -83,7 +83,8 @@ def __init__( warmup_steps: int, num_beams: int, eval_num_retrieved: int, - eval_num_cpus: int, + eval_num_workers: int, + eval_num_gpus: int, eval_num_theorems: int, max_inp_seq_len: int, max_oup_seq_len: int, @@ -97,7 +98,8 @@ def __init__( self.num_beams = num_beams self.length_penalty = length_penalty self.eval_num_retrieved = eval_num_retrieved - self.eval_num_cpus = eval_num_cpus + self.eval_num_workers = eval_num_workers + self.eval_num_gpus = eval_num_gpus self.eval_num_theorems = eval_num_theorems self.max_inp_seq_len = max_inp_seq_len self.max_oup_seq_len = max_oup_seq_len @@ -244,12 +246,14 @@ def on_validation_epoch_end(self) -> None: ckpt_path = f"{self.trainer.log_dir}/checkpoints/last.ckpt" self.trainer.save_checkpoint(ckpt_path) logger.info(f"Saved checkpoint to {ckpt_path}. Evaluating...") + torch.cuda.empty_cache() data_path = self.trainer.datamodule.data_path if self.retriever is None: acc = evaluate( data_path=data_path, - num_cpus=self.eval_num_cpus, + num_workers=self.eval_num_workers, + num_gpus=self.eval_num_gpus, num_theorems=self.eval_num_theorems, ckpt_path=ckpt_path, ) @@ -264,7 +268,8 @@ def on_validation_epoch_end(self) -> None: ) acc = evaluate( data_path=data_path, - num_cpus=self.eval_num_cpus, + num_workers=self.eval_num_workers, + num_gpus=self.eval_num_gpus, num_theorems=self.eval_num_theorems, ckpt_path=ckpt_path, indexed_corpus_path=corpus_path, diff --git a/prover/proof_search.py b/prover/proof_search.py index 8eb33cc..33caf1c 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -401,7 +401,6 @@ def __init__( ) return - ray.init() if num_gpus >= 1: logger.info(f"Launching {num_workers} workers with {num_gpus} GPUs.") num_gpus_per_worker = num_gpus / num_workers