evaluate_squad.py (forked from intel/neural-compressor)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Official evaluation script for v1.1 of the SQuAD dataset.
From https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py
"""
from __future__ import print_function
import sys
from collections import Counter
from .f1 import normalize_answer


def f1_score(prediction, ground_truth):
    """Calculate the F1 score between the prediction and the ground truth.

    Args:
        prediction: The predicted result.
        ground_truth: The ground truth.

    Returns:
        The F1 score of the prediction. Floating point number.
    """
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
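

# A worked illustration, not part of the original script (it assumes normalize_answer
# lower-cases and strips punctuation, as in the standard SQuAD evaluation script):
# for prediction "Paris, France" and ground truth "Paris", one token overlaps, so
# precision = 1/2, recall = 1/1, and F1 = 2 * 0.5 * 1.0 / 1.5 ≈ 0.667, i.e.
# f1_score("Paris, France", "Paris") would return roughly 0.667.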


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Calculate the maximum metric over all ground truths.

    For each answer in ground_truths, evaluate the metric of the prediction against
    that answer, and return the maximum metric over all answers.

    Args:
        metric_fn: The function to calculate the metric.
        prediction: The prediction result.
        ground_truths: A list of correct answers.

    Returns:
        The maximum metric. Floating point number.
    """
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)
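

# A brief illustration, not part of the original script: SQuAD questions usually carry
# several human answers, e.g. ground_truths = ["Denver Broncos", "Broncos"]. A call such as
# metric_max_over_ground_truths(exact_match_score, "Broncos", ["Denver Broncos", "Broncos"])
# scores the prediction against every answer and keeps the best score, so a prediction is
# not penalized for matching only one of the acceptable answers.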


def exact_match_score(prediction, ground_truth):
    """Compute the exact match score between prediction and ground truth.

    Args:
        prediction: The result of predictions to be evaluated.
        ground_truth: The ground truth.

    Returns:
        The exact match score.
    """
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def evaluate(dataset, predictions):
    """Evaluate the average F1 score and the exact match score for question-answering results.

    Args:
        dataset: The dataset to evaluate the predictions against. A list of articles.
            An article contains a list of paragraphs, a paragraph contains a list of
            question-and-answers (qas), and a question-and-answer contains an id, a question,
            and a list of correct answers.
        predictions: The result of predictions to be evaluated. A dict mapping the id of
            a question to the predicted answer of the question.

    Returns:
        A dict with the exact match score and the F1 score, both as percentages.
    """
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                total += 1
                if qa["id"] not in predictions:
                    message = "Unanswered question " + qa["id"] + " will receive score 0."
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x["text"], qa["answers"]))
                prediction = predictions[qa["id"]]
                exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    return {"exact_match": exact_match, "f1": f1}
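

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the original script. Because of the relative
    # import above, the module must be run from inside its package (e.g. via
    # `python -m <package>.evaluate_squad`); the question id and texts below are made-up
    # illustration values shaped like the SQuAD v1.1 "data" field.
    toy_dataset = [
        {
            "paragraphs": [
                {
                    "qas": [
                        {
                            "id": "q1",
                            "question": "Which NFL team won Super Bowl 50?",
                            "answers": [{"text": "Denver Broncos"}, {"text": "Broncos"}],
                        }
                    ]
                }
            ]
        }
    ]
    toy_predictions = {"q1": "The Denver Broncos"}
    # Assuming the standard SQuAD normalization (lower-casing, article and punctuation
    # removal), this should print {'exact_match': 100.0, 'f1': 100.0}.
    print(evaluate(toy_dataset, toy_predictions))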