
[FIX] distribution.py: Fix computation on multiclass data #2903

Merged: 10 commits, Feb 26, 2018
Orange/data/_valuecount.pyx: 6 changes (4 additions, 2 deletions)
@@ -12,8 +12,7 @@ from numpy cimport NPY_FLOAT64 as NPY_float64
@cython.boundscheck(False)
@cython.wraparound(False)
def valuecount(np.ndarray[np.float64_t, ndim=2] a not None):
"""
Count the occurrences of each value.
"""Count the occurrences of each value.

It does so in-place, on a 2-d array of shape (2, N); the first row
contains values and the second contains weights (1's, if unweighted).
@@ -22,10 +21,13 @@ def valuecount(np.ndarray[np.float64_t, ndim=2] a not None):
consecutive columns with the same value in the first row, and adding
the corresponding weights in the second row.

Examples
--------
>>> a = np.array([[1, 1, 2, 3, 3], [0.1, 0.2, 0.3, 0.4, 0.5]])
>>> _orange.valuecount(a)
[[ 1. 2. 3. ]
[ 0.3 0.3 0.9]]

"""
cdef np.npy_intp *dim
dim = np.PyArray_DIMS(a)
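For readers skimming this diff, the docstring above describes the in-place run-length counting that the Cython routine performs. A rough pure-NumPy sketch of the same computation (an illustration only, not code from this patch; it returns a new array instead of modifying the input in place) could look like this:

import numpy as np

def valuecount_sketch(a):
    # a has shape (2, N): row 0 holds (sorted) values, row 1 holds weights.
    values, weights = a
    # Indices where a run of equal values starts.
    starts = np.r_[0, np.nonzero(np.diff(values))[0] + 1]
    # Keep one representative value per run and sum the weights of each run.
    return np.vstack((values[starts], np.add.reduceat(weights, starts)))

a = np.array([[1, 1, 2, 3, 3], [0.1, 0.2, 0.3, 0.4, 0.5]])
print(valuecount_sketch(a))
# [[1.  2.  3. ]
#  [0.3 0.3 0.9]]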
Orange/statistics/contingency.py: 108 changes (46 additions, 62 deletions)
@@ -1,45 +1,39 @@
import math
import numpy as np

from Orange import data


def _get_variable(variable, dat, attr_name,
expected_type=None, expected_name=""):
def _get_variable(variable, dat, attr_name, expected_type=None, expected_name=""):
failed = False
if isinstance(variable, data.Variable):
datvar = getattr(dat, "variable", None)
if datvar is not None and datvar is not variable:
raise ValueError("variable does not match the variable"
"in the data")
raise ValueError("variable does not match the variable in the data")
elif hasattr(dat, "domain"):
variable = dat.domain[variable]
elif hasattr(dat, attr_name):
variable = dat.variable
else:
failed = True
if failed or (expected_type is not None and
not isinstance(variable, expected_type)):
if failed or (expected_type is not None and not isinstance(variable, expected_type)):
if not expected_type or isinstance(variable, data.Variable):
raise ValueError(
"expected %s variable not %s" % (expected_name, variable))
raise ValueError("expected %s variable not %s" % (expected_name, variable))
else:
raise ValueError("expected %s, not '%s'" %
(expected_type.__name__, type(variable).__name__))
raise ValueError("expected %s, not '%s'" % (
expected_type.__name__, type(variable).__name__))
return variable


def create_discrete(cls, *args):
def _create_discrete(cls, *args):
return cls(*args)


class Discrete(np.ndarray):
def __new__(cls, dat=None,
col_variable=None, row_variable=None,
def __new__(cls, dat=None, col_variable=None, row_variable=None,
unknowns=None, unknown_rows=None):
if isinstance(dat, data.Storage):
if unknowns is not None:
raise TypeError(
"incompatible arguments (data storage and 'unknowns'")
raise TypeError("incompatible arguments (data storage and 'unknowns'")
return cls.from_data(dat, col_variable, row_variable)

if row_variable is not None:
@@ -62,24 +56,20 @@ def __new__(cls, dat=None,
self.unknown_rows = unknown_rows or 0
else:
self[...] = dat
self.unknowns = (unknowns if unknowns is not None
else getattr(dat, "unknowns", 0))
self.unknowns = unknowns if unknowns is not None else getattr(dat, "unknowns", 0)
self.unknown_rows = unknown_rows if unknown_rows is not None else 0
return self


@classmethod
def from_data(cls, data, col_variable, row_variable=None):
if row_variable is None:
row_variable = data.domain.class_var
if row_variable is None:
raise ValueError("row_variable needs to be specified (data "
"has no class)")
raise ValueError("row_variable needs to be specified (data has no class)")
row_variable = _get_variable(row_variable, data, "row_variable")
col_variable = _get_variable(col_variable, data, "col_variable")
try:
conts, unknown_rows = data._compute_contingency(
[col_variable], row_variable)
conts, unknown_rows = data._compute_contingency([col_variable], row_variable)
dist, unknowns = conts[0]

self = super().__new__(cls, dist.shape)
@@ -97,23 +87,22 @@ def from_data(cls, data, col_variable, row_variable=None):
for row in data:
rval, cval = row[rind], row[cind]
w = row.weight
if math.isnan(rval):
if np.isnan(rval):
self.unknown_rows += w
continue
if math.isnan(cval):
if np.isnan(cval):
self.unknowns[cval] += w
else:
self[int(rval), int(cval)] += w
self.row_variable = row_variable
self.col_variable = col_variable
return self


def __eq__(self, other):
return np.array_equal(self, other) and (
not hasattr(other, "unknowns") or
np.array_equal(self.unknowns, other.unknowns))

return (
np.array_equal(self, other) and
(not hasattr(other, "unknowns") or np.array_equal(self.unknowns, other.unknowns))
)

def __getitem__(self, index):
if isinstance(index, str):
@@ -150,7 +139,6 @@ def __setitem__(self, index, value):
index = (index[0], self.col_variable.to_val(index[1]))
super().__setitem__(index, value)


def normalize(self, axis=None):
t = np.sum(self, axis=axis)
if t > 1e-6:
@@ -159,18 +147,18 @@ def normalize(self, axis=None):
self.unknowns /= t

def __reduce__(self):
return create_discrete, (Discrete, np.copy(self),
self.col_variable, self.row_variable,
self.unknowns)
return (
_create_discrete,
(Discrete, np.copy(self), self.col_variable, self.row_variable, self.unknowns)
)


class Continuous:
def __init__(self, dat=None, col_variable=None, row_variable=None,
unknowns=None, unknown_rows=None):
if isinstance(dat, data.Storage):
if unknowns is not None:
raise TypeError(
"incompatible arguments (data storage and 'unknowns'")
raise TypeError("incompatible arguments (data storage and 'unknowns'")
return self.from_data(dat, col_variable, row_variable)

if row_variable is not None:
@@ -195,30 +183,27 @@ def __init__(self, dat=None, col_variable=None, row_variable=None,
else:
self.unknown_rows = None


def from_data(self, data, col_variable, row_variable=None):
if row_variable is None:
row_variable = data.domain.class_var
if row_variable is None:
raise ValueError("row_variable needs to be specified (data"
"has no class)")
raise ValueError("row_variable needs to be specified (data has no class)")
self.row_variable = _get_variable(row_variable, data, "row_variable")
self.col_variable = _get_variable(col_variable, data, "col_variable")
try:
conts, self.unknown_rows = data._compute_contingency(
[col_variable], row_variable)
conts, self.unknown_rows = data._compute_contingency([col_variable], row_variable)
(self.values, self.counts), self.unknowns = conts[0]
except NotImplementedError:
raise NotImplementedError("Fallback method for computation of "
"contingencies is not implemented yet")

raise NotImplementedError(
"Fallback method for computation of contingencies is not implemented yet"
)

def __eq__(self, other):
return (np.array_equal(self.values, other.values) and
np.array_equal(self.counts, other.counts) and
(not hasattr(other, "unknowns") or
np.array_equal(self.unknowns, other.unknowns)))

return (
np.array_equal(self.values, other.values) and
np.array_equal(self.counts, other.counts) and
(not hasattr(other, "unknowns") or np.array_equal(self.unknowns, other.unknowns))
)

def __getitem__(self, index):
""" Return contingencies for a given class value. """
@@ -228,15 +213,14 @@ def __getitem__(self, index):
ind = C > 0
return np.vstack((self.values[ind], C[ind]))


def __len__(self):
return self.counts.shape[0]


def __setitem__(self, index, value):
raise NotImplementedError("Setting individual class contingencies is "
"not implemented yet. Set .values and .counts.")

raise NotImplementedError(
"Setting individual class contingencies is not implemented yet. "
"Set .values and .counts."
)

def normalize(self, axis=None):
if axis is None:
@@ -245,8 +229,9 @@ def normalize(self, axis=None):
for x in self:
x[:, 1] /= t
elif axis != 1:
raise ValueError("contingencies can be normalized only with axis=1"
" or without axis")
raise ValueError(
"contingencies can be normalized only with axis=1 or without axis"
)
else:
for i, x in enumerate(self):
t = np.sum(x[:, 1])
Expand All @@ -265,20 +250,19 @@ def get_contingency(dat, col_variable, row_variable=None, unknowns=None, unknown
elif variable.is_continuous:
return Continuous(dat, col_variable, row_variable, unknowns, unknown_rows)
else:
raise TypeError("cannot compute distribution of '%s'" %
type(variable).__name__)
raise TypeError("cannot compute distribution of '%s'" % type(variable).__name__)


def get_contingencies(dat, skipDiscrete=False, skipContinuous=False):
def get_contingencies(dat, skip_discrete=False, skip_continuous=False):
vars = dat.domain.attributes
row_var = dat.domain.class_var
if row_var is None:
raise ValueError("data has no target variable")
if skipDiscrete:
if skipContinuous:
if skip_discrete:
if skip_continuous:
return []
columns = [i for i, var in enumerate(vars) if var.is_continuous]
elif skipContinuous:
elif skip_continuous:
columns = [i for i, var in enumerate(vars) if var.is_discrete]
else:
columns = None
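For orientation on the module-level functions touched at the end of this file, here is a hypothetical usage sketch (not part of the patch; it assumes the iris dataset bundled with Orange, which has a discrete class variable and continuous attributes such as "petal length"):

from Orange.data import Table
from Orange.statistics.contingency import get_contingency, get_contingencies

iris = Table("iris")

# Contingency of one attribute against the class variable
# (row_variable defaults to the data's class variable).
cont = get_contingency(iris, "petal length")

# Contingencies for all attributes; the renamed keyword arguments
# (formerly skipDiscrete / skipContinuous) filter attributes by type.
all_conts = get_contingencies(iris, skip_discrete=False, skip_continuous=False)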