-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstatistics.py
executable file
·73 lines (66 loc) · 2.45 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
import sys
import argparse
# Command-line interface: where to write the curve data, how to interpret the
# input scores, which curve to produce, and where to read the table from.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-o", "--out", type=argparse.FileType('w', encoding='UTF-8'),
    default=sys.stdout, help="the txt filename to save the data to"
)
parser.add_argument(
    "-s", "--sorted", action='store_true',
    help="whether the data is already sorted by its probabilities; "
         "the second column will be ignored"
)
parser.add_argument(
    "-f", "--flip", action='store_true',
    help="whether to flip the probabilities"
)
# argparse builds the attribute name from the long option by dropping the
# leading "--" and turning every remaining "-" into "_", so this option is
# read back as args.flip__sorted (two underscores).
parser.add_argument(
    "--flip--sorted", action='store_true',
    help="whether to only flip the probabilities (this is the same as -f "
         "and -s specified together and overrides the others)"
)
parser.add_argument(
    "-r", "--roc", action='store_true',
    help="create roc (instead of prc) data"
)
parser.add_argument(
    "table", nargs="?", default=sys.stdin,
    help="a two column (truth/probs) table of variant classifications w/o a header"
)
args = parser.parse_args()

# --flip--sorted is shorthand for giving both -f and -s
if args.flip__sorted:
    args.flip = True
    args.sorted = True
# an explicitly empty positional argument is treated like an omitted one:
# the table is read from stdin
if args.table == '':
    args.table = sys.stdin
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
# read the file into a pandas data frame
# (when the input is already sorted, only the first column, 'truth', is loaded)
columns = ['truth'] if args.sorted else ['truth', 'probs']
df = pd.read_csv(
    args.table, sep='\t', header=None, names=['truth', 'probs'],
    # np.float64 instead of the np.float_ alias, which NumPy 2.0 removed
    index_col=False, dtype={'probs': np.float64, 'truth': np.bool_},
    low_memory=False, na_values='.', usecols=columns
)
# values that were read as missing ('.') are replaced by 0
df.fillna(0, inplace=True)

if args.sorted:
    print("Input already sorted.", file=sys.stderr)
    # use the row position, scaled to [0, 1], as the score; the denominator is
    # clamped to 1 so that a single-row table does not divide 0 by 0
    scores = df.index / max(df.index[-1], 1)
    # predictions are already technically inverted
    # (the first row gets the lowest score), so "not flipped" means 1 - score
    if not args.flip:
        scores = 1 - scores
    else:
        print("Inverting predictions.", file=sys.stderr)
else:
    # replace inf values with a number 1 larger than the next largest value
    if df['probs'].max() == np.inf:
        distinct = np.sort(df['probs'].unique())
        if len(distinct) < 2:
            # there is no "next largest value" to anchor the replacement on
            sys.exit("error: every score is infinite; cannot rank predictions")
        df['probs'] = df['probs'].replace(np.inf, distinct[-2] + 1)
    # turn the scores into probabilities if they're not already
    # NOTE(review): dividing by the maximum only preserves the ranking when the
    # maximum is positive; a maximum of 0 yields NaN/inf and a negative maximum
    # reverses the order -- confirm that inputs are expected to be non-negative
    scores = df['probs'] / df['probs'].max()
    if args.flip:
        print("Inverting predictions.", file=sys.stderr)
        scores = 1 - scores

# compute the requested curve; the thresholds returned by scikit-learn are
# not written out
if args.roc:
    fpr, tpr, _ = roc_curve(df['truth'], scores)
    curve = np.array([fpr, tpr])
else:
    precision, recall, _ = precision_recall_curve(df['truth'], scores)
    curve = np.array([recall, precision])
# the output has two rows: fpr then tpr for ROC, recall then precision for PRC
np.savetxt(args.out, curve)