From fa1e98c54c2e74cd35da3e4102b0aa93ea33b474 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maik=20Fr=C3=B6be?=
Date: Fri, 17 May 2024 21:15:41 +0200
Subject: [PATCH] Add initial prototype for scored-docs access into ir_datasets #1

---
Notes (review):
    What the patch adds: an executable script, data/ir_datasets_scored_docs.py,
    defining scored_docs(rank_distill_llm_run=...). The function wraps a
    RequestsDownload of
    https://zenodo.org/records/11147862/files/<run>?download=1 in a Cache
    located at home_path() / 'rank-disti-llm' / <run>, and returns a
    TrecScoredDocs built on that cache. When run as a script, it prints each
    item yielded by scored_docs().scoreddocs_iter().

    The added lines below are reproduced unchanged: the hunk header (+1,15),
    the diffstat and the blob id 9286c87 all describe exactly this payload.

    Points to confirm in a follow-up commit:
    * Inside the function, the local variable scored_docs shadows the
      function's own name. Harmless here (it is only returned), but a
      distinct local name would read more clearly.
    * The cache directory is spelled 'rank-disti-llm' while the parameter is
      named rank_distill_llm_run. Possibly a typo for 'rank-distill-llm';
      verify before changing, since the string decides the on-disk cache
      location.
    * rank_distill_llm_run is interpolated into both the URL and the cache
      path without validation; presumably callers pass only known run file
      names. TODO confirm.
    * The function has no docstring and the long default run name is an
      inline literal; a named module-level constant would document it.

 data/ir_datasets_scored_docs.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100755 data/ir_datasets_scored_docs.py

diff --git a/data/ir_datasets_scored_docs.py b/data/ir_datasets_scored_docs.py
new file mode 100755
index 0000000..9286c87
--- /dev/null
+++ b/data/ir_datasets_scored_docs.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+from ir_datasets.util import RequestsDownload, Cache, home_path
+from ir_datasets.formats import TrecScoredDocs
+
+
+def scored_docs(rank_distill_llm_run='__rankzephyr-colbert-10000-sampled-100__msmarco-passage-train-judged.run'):
+    base_path = home_path() / 'rank-disti-llm'
+    requests_download = RequestsDownload(f'https://zenodo.org/records/11147862/files/{rank_distill_llm_run}?download=1')
+    scored_docs = TrecScoredDocs(Cache(requests_download, base_path/rank_distill_llm_run))
+
+    return scored_docs
+
+if __name__ == '__main__':
+    for i in scored_docs().scoreddocs_iter():
+        print(i)