diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index 2c0f5def9..f06574c99 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -361,9 +361,11 @@ def collate_fn(self, data): "cont_byte_len": torch.LongTensor(cont_byte_lens), "input_ids": torch.stack(queries), "dc_input_ids": torch.stack(dc_queries), - "label_id": torch.LongTensor(label_ids), } + if not isinstance(label_ids, str): + batch["label_id"] = torch.LongTensor(label_ids) + return batch def token_encode(self, string: str) -> List[int]: @@ -1538,7 +1540,7 @@ def prep_examples(self): label_id = request["label"] cont_id = request["idx"] if self.metric_type in ["ce_loss", "bpb"]: - if label_id != cont_id: + if label_id != cont_id and not isinstance(label_id, str): # Skip non-target continuations for ce_loss and bpb continue else: @@ -1758,6 +1760,7 @@ def doc_to_label(self, doc) -> int: "csqa_rc_0shot_bpb": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_0shot", "metric_type": "bpb"}), "csqa_rc_5shot": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_5shot", "metric_type": "len_norm"}), "csqa_rc_5shot_bpb": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_5shot", "metric_type": "bpb"}), + "gsm8k_gold_bpb_5shot": (OEEvalTask, {"dataset_path": "gsm8k", "dataset_name": "gold_bpb_5shot", "metric_type": "bpb"}), "hellaswag_mc_5shot": ( OEEvalTask, {"dataset_path": "hellaswag", "dataset_name": "mc_5shot", "metric_type": "acc"}, diff --git a/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/config.json b/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/config.json index 42fe92b3f..40cbee3b6 100644 --- a/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/config.json +++ b/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/config.json @@ -1 +1 @@ -{"task_name": "gsm8k", "task_hash": "c9a8b5bfa866f678c3ea4ef06729f149", "task_config": {"task_name": "gsm8k", "task_core": "gsm8k", "limit": null, "split": "test", "num_shots": 8, "fewshot_seed": 1234, "primary_metric": "logits_per_byte", "random_subsample_seed": 1234, "context_kwargs": {"no_cot": false}, "generation_kwargs": {"max_gen_toks": 512, "do_sample": false, "temperature": 0.0, "stop_sequences": ["Question:", "", "<|im_end|>", "\n\n"], "repeats": 1}, "metric_kwargs": {"regexes_to_ignore": [",", "\\$", "(?s).*#### ", "\\.$"]}, "native_id_field": "id", "fewshot_source": "STD:GSM8k", "dataset_path": "gsm8k", "dataset_name": "main", "use_chat_format": null, "version": 0.1, "revision": null, "compute_gold_bpb": true, "metadata": {"alias": "gsm8k::bpb"}}, "current_date": "2025-01-08 21:03:44 UTC", "num_instances": 1319} \ No newline at end of file +{"task_name": "gsm8k", "task_hash": "c9a8b5bfa866f678c3ea4ef06729f149", "task_config": {"task_name": "gsm8k", "task_core": "gsm8k", "limit": null, "split": "test", "num_shots": 8, "fewshot_seed": 1234, "primary_metric": "logits_per_byte", "random_subsample_seed": 1234, "context_kwargs": {"no_cot": false}, "generation_kwargs": {"max_gen_toks": 512, "do_sample": false, "temperature": 0.0, "stop_sequences": ["Question:", "", "<|im_end|>", "\n\n"], "repeats": 1}, "metric_kwargs": {"regexes_to_ignore": [",", "\\$", "(?s).*#### ", "\\.$"]}, "native_id_field": "id", "fewshot_source": "STD:GSM8k", "dataset_path": "gsm8k", "dataset_name": "main", "use_chat_format": null, "version": 0.1, "revision": null, "compute_gold_bpb": true, "metadata": {"alias": "gsm8k::bpb"}}, "current_date": "2025-01-08 21:30:11 UTC", "num_instances": 1319} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/requests.jsonl.gz index d65ee84c3..6803a63d8 100644 Binary files a/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/requests.jsonl.gz and b/olmo_data/oe_eval_tasks/gsm8k/gold_bpb_5shot/requests.jsonl.gz differ