-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbase.py
321 lines (283 loc) · 11.7 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
import numpy as np
from math import ceil
from scipy.special import binom
class BaseComparisons(object):
    """Base class to compare arbitrary numbers of fingerprints.

    Attributes
    ----------
    fingerprints : {np.ndarray, int}
        Numpy array with the fingerprints that will be compared.
        The fingerprints must be also given as Numpy arrays.
        If an int is given it assumes that one is comparing
        n random fingerprints of infinite length.
    c_threshold : int
        Coincidence threshold actually in use (after resolving None or
        'dissimilar' to a number).
    matches : np.ndarray
        matches[k] is the number of positions in which exactly k
        fingerprints have a non-zero entry.
    d_vector : np.ndarray
        d_vector[k] = |2k - n_fingerprints|.
    weights, weighted_matches : np.ndarray
        Weight of every k and the element-wise product matches * weights.
    a, d, w_a, w_d
        (Weighted) 1-similarity and 0-similarity counters.
    dis_counters, w_dis_counters : np.ndarray
        (Weighted) dissimilarity counters.
    total_sim, total_w_sim, total_dis, total_w_dis, p, w_p
        Totals derived from the counters above.

    Properties
    ----------
    n_fingerprints : int
        Number of fingerprints that will be compared.

    Methods
    -------
    __init__(self, fingerprints, c_threshold=None, w_factor="fraction")
        Initialize the object.
    assign_fingerprints(fingerprints)
        Assign fingerprints.
    assign_c_threshold(c_threshold)
        Assign coincidence threshold.
    set_matches()
        Calculate the matches between the fingerprints.
    set_d_vector()
        Calculate the d vector.
    set_w_factor(w_factor)
        Calculate weight factors.
    set_weighted_matches()
        Calculate weighted matches.
    set_a()
        Calculate the (unweighted) 1-similarity counter.
    set_d()
        Calculate the (unweighted) 0-similarity counter.
    set_weighted_a()
        Calculate the (weighted) 1-similarity counter.
    set_weighted_d()
        Calculate the (weighted) 0-similarity counter.
    set_dis_counters()
        Calculate the (unweighted) dissimilarity counters.
    set_weighted_dis_counters()
        Calculate the (weighted) dissimilarity counters.
    set_total_sim_counter()
        Calculate the total number of (unweighted) similarity counters.
    set_total_weighted_sim_counter()
        Calculate the total number of (weighted) similarity counters.
    total_dis_counters()
        Calculate total number of (unweighted) dissimilarity counters.
    total_weighted_dis_counters()
        Calculate total number of (weighted) dissimilarity counters.
    set_p()
        Calculate p.
    set_weighted_p()
        Calculate weighted p.
    """

    def __init__(self, fingerprints, c_threshold=None, w_factor="fraction"):
        """Initialize the object.

        Parameters
        ----------
        fingerprints : {np.ndarray, int}
            Numpy array with the fingerprints that will be compared.
            The fingerprints must be also given as Numpy arrays.
            If an int is given it assumes that one is comparing
            n random fingerprints of infinite length.
        c_threshold : {None, 'dissimilar', int}
            Coincidence threshold.
        w_factor : {"fraction", "power_n"}
            Type of weight function that will be used ("power_n" stands
            for "power_" followed by an integer base, e.g. "power_2").
        """
        # The order matters: every step only reads attributes that were
        # set by the steps above it.
        self.assign_fingerprints(fingerprints)
        self.assign_c_threshold(c_threshold)
        self.set_matches()
        self.set_d_vector()
        self.set_w_factor(w_factor)
        self.set_weighted_matches()
        self.set_a()
        self.set_d()
        self.set_weighted_a()
        self.set_weighted_d()
        self.set_dis_counters()
        self.set_weighted_dis_counters()
        self.set_total_sim_counter()
        self.set_total_weighted_sim_counter()
        self.total_dis_counters()
        self.total_weighted_dis_counters()
        self.set_p()
        self.set_weighted_p()

    @property
    def n_fingerprints(self):
        """Return number of fingerprints.

        Returns
        -------
        n_fingerprints : int
            Number of fingerprints that will be compared.

        Note: If fingerprints is an int this is taken as the number of
        fingerprints that will be compared.
        """
        if isinstance(self.fingerprints, int):
            return self.fingerprints
        return len(self.fingerprints)

    def assign_fingerprints(self, fingerprints):
        """Assign fingerprints.

        Parameters
        ----------
        fingerprints : {np.ndarray, int}
            Numpy array with the fingerprints that will be compared.
            The fingerprints must be also given as Numpy arrays.
            If an int (or NumPy integer scalar) is given it assumes that
            one is comparing n random fingerprints of infinite length.

        Raises
        ------
        TypeError
            If fingerprints is not a numpy array.
            If the elements of fingerprints are not numpy arrays.
        ValueError
            If fingerprints is not a positive integer.
            If less than two fingerprints are provided.
            If not all the fingerprints have the same length.
        """
        if isinstance(fingerprints, (int, np.integer)):
            if fingerprints <= 0:
                raise ValueError("If fingerprints is given as an integer, "
                                 "it should be positive integer")
            # Normalise NumPy integers so the isinstance(..., int) checks
            # in n_fingerprints and set_matches keep working.
            self.fingerprints = int(fingerprints)
            return

        if not isinstance(fingerprints, np.ndarray):
            raise TypeError("Fingerprints must be a numpy array or an int.")
        if not all(isinstance(fingerprint, np.ndarray) for fingerprint in fingerprints):
            raise TypeError("The elements of fingerprints must be a numpy array.")
        if len(fingerprints) < 2:
            raise ValueError("A minimum of 2 fingerprints must be provided.")
        expected_length = len(fingerprints[0])
        if not all(len(fingerprint) == expected_length for fingerprint in fingerprints):
            raise ValueError("All the fingerprints must have the same length.")
        self.fingerprints = fingerprints

    def assign_c_threshold(self, c_threshold):
        """Assign coincidence threshold.

        Parameters
        ----------
        c_threshold : {None, 'dissimilar', int}
            Coincidence threshold.
            None : Default, c_threshold = n_fingerprints % 2
            'dissimilar' : c_threshold = ceil(n_fingerprints / 2)
            int : Integer number < n_fingerprints

        Raises
        ------
        TypeError
            If c_threshold is not None, 'dissimilar', or an integer.
        ValueError
            If c_threshold is an integer equal or greater than n_fingerprints
        """
        if c_threshold is None:
            self.c_threshold = self.n_fingerprints % 2
        elif isinstance(c_threshold, str):
            if c_threshold != 'dissimilar':
                raise TypeError("c_threshold must be None, 'dissimilar', or an integer.")
            self.c_threshold = ceil(self.n_fingerprints / 2)
        elif isinstance(c_threshold, (int, np.integer)):
            if c_threshold >= self.n_fingerprints:
                raise ValueError("c_threshold cannot be equal or greater than n_fingerprints.")
            self.c_threshold = int(c_threshold)
        else:
            # Without this branch an unsupported type left the attribute
            # unset and failed much later with an AttributeError.
            raise TypeError("c_threshold must be None, 'dissimilar', or an integer.")

    def set_matches(self):
        """Calculate the matches between the fingerprints.

        Sets ``self.matches`` so that ``matches[k]`` is the number of
        positions whose column sum over all fingerprints equals k. When
        fingerprints is an int, the binomial coefficients C(n, k) are
        used instead.
        """
        counts = range(self.n_fingerprints + 1)
        if isinstance(self.fingerprints, int):
            matches = [int(binom(self.n_fingerprints, k)) for k in counts]
        else:
            c_total = np.sum(self.fingerprints, axis=0)
            matches = [np.count_nonzero(c_total == k) for k in counts]
        self.matches = np.array(matches)

    def set_d_vector(self):
        """Calculate the d vector.

        Notes
        -----
        The entries of this vector are the numbers |2k - n_fingerprints|,
        which measure the degree of coincidence between the given fingerprints.
        """
        n = self.n_fingerprints
        self.d_vector = np.array([abs(2 * k - n) for k in range(n + 1)])

    def set_w_factor(self, w_factor):
        """Calculate weight factors.

        Parameters
        ----------
        w_factor : {"fraction", "power_n"}
            Type of weight function that will be used.
            'fraction' : similarity = d[k]/n_fingerprints
                         dissimilarity = 1 - (d[k] - n_fingerprints % 2)/n_fingerprints
            'power_n' : with n a positive integer, e.g. 'power_2',
                        similarity = n**-(n_fingerprints - d[k])
                        dissimilarity = n**-(d[k] - n_fingerprints % 2)
            other values : similarity = dissimilarity = 1

        Raises
        ------
        ValueError
            If w_factor starts with "power_" but the rest is not a
            positive integer (this includes the literal "power_n").
        """
        n = self.n_fingerprints
        parity = n % 2

        if isinstance(w_factor, str) and w_factor.startswith("power_"):
            suffix = w_factor.split("_")[-1]
            try:
                power = int(suffix)
            except ValueError:
                raise ValueError(
                    "w_factor must have the form 'power_<int>' (e.g. 'power_2'), "
                    "got {!r}.".format(w_factor))
            if power < 1:
                raise ValueError("The base in 'power_<int>' must be a positive integer.")
            # Work in floating point with plain Python ints in the
            # exponent: NumPy refuses to raise an integer to a negative
            # integer power.
            base = float(power)

            def f_s(d):
                return base ** -(n - int(d))

            def f_d(d):
                return base ** -(int(d) - parity)
        elif w_factor == "fraction":
            def f_s(d):
                return d / n

            def f_d(d):
                return 1 - (d - parity) / n
        else:
            def f_s(d):
                return 1

            def f_d(d):
                return 1

        # A k counts as "similar" when its coincidence degree exceeds the
        # threshold and as "dissimilar" otherwise.
        weights = [f_s(d) if d > self.c_threshold else f_d(d)
                   for d in self.d_vector]
        self.weights = np.array(weights)

    def set_weighted_matches(self):
        """Calculate weighted matches (element-wise matches * weights)."""
        self.weighted_matches = self.matches * self.weights

    def set_a(self):
        """Calculate the (unweighted) 1-similarity counter.

        Sums matches[k] over every k with 2k - n_fingerprints > c_threshold.
        """
        k = np.arange(self.n_fingerprints + 1)
        mask = 2 * k - self.n_fingerprints > self.c_threshold
        self.a = np.sum(self.matches[mask])

    def set_d(self):
        """Calculate the (unweighted) 0-similarity counter.

        Sums matches[k] over every k with n_fingerprints - 2k > c_threshold.
        """
        k = np.arange(self.n_fingerprints + 1)
        mask = self.n_fingerprints - 2 * k > self.c_threshold
        self.d = np.sum(self.matches[mask])

    def set_weighted_a(self):
        """Calculate the (weighted) 1-similarity counter.

        Sums weighted_matches[k] over every k with
        2k - n_fingerprints > c_threshold.
        """
        k = np.arange(self.n_fingerprints + 1)
        mask = 2 * k - self.n_fingerprints > self.c_threshold
        self.w_a = np.sum(self.weighted_matches[mask])

    def set_weighted_d(self):
        """Calculate the (weighted) 0-similarity counter.

        Sums weighted_matches[k] over every k with
        n_fingerprints - 2k > c_threshold.
        """
        k = np.arange(self.n_fingerprints + 1)
        mask = self.n_fingerprints - 2 * k > self.c_threshold
        self.w_d = np.sum(self.weighted_matches[mask])

    def set_dis_counters(self):
        """Calculate the (unweighted) dissimilarity counters.

        Keeps matches[k] for every k with d_vector[k] <= c_threshold.
        """
        self.dis_counters = self.matches[self.d_vector <= self.c_threshold]

    def set_weighted_dis_counters(self):
        """Calculate the (weighted) dissimilarity counters.

        Keeps weighted_matches[k] for every k with d_vector[k] <= c_threshold.
        """
        self.w_dis_counters = self.weighted_matches[self.d_vector <= self.c_threshold]

    def set_total_sim_counter(self):
        """Calculate the total number of (unweighted) similarity counters."""
        self.total_sim = self.a + self.d

    def set_total_weighted_sim_counter(self):
        """Calculate the total number of (weighted) similarity counters."""
        self.total_w_sim = self.w_a + self.w_d

    def total_dis_counters(self):
        """Calculate total number of (unweighted) dissimilarity counters."""
        self.total_dis = np.sum(self.dis_counters)

    def total_weighted_dis_counters(self):
        """Calculate total number of (weighted) dissimilarity counters."""
        self.total_w_dis = np.sum(self.w_dis_counters)

    def set_p(self):
        """Calculate p (total similarity plus total dissimilarity)."""
        self.p = self.total_sim + self.total_dis

    def set_weighted_p(self):
        """Calculate weighted p (weighted similarity plus dissimilarity)."""
        self.w_p = self.total_w_sim + self.total_w_dis