Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Feature constructor optimization #5975

Merged
merged 3 commits into from
May 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions Orange/data/variable.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
import warnings
from collections.abc import Iterable
from typing import Sequence

from datetime import datetime, timedelta, timezone
from numbers import Number, Real, Integral
Expand Down Expand Up @@ -168,6 +169,44 @@ def __new__(cls, variable, value=Unknown):
self._value = value
return self

@staticmethod
def _as_values_primitive(variable, data) -> Sequence['Value']:
    """
    Build one `Value` per element of `data` for a primitive `variable`.

    Each result is created directly through `float.__new__` with `Value`
    as the class, and then has its `variable` attribute assigned.

    `data` must support `len()` and iteration.
    """
    assert variable.is_primitive()
    value_cls = Value
    new_float = float.__new__
    # Pre-size the result with a shared placeholder; every slot is
    # overwritten in the loop below.
    out = [Value(variable, np.nan)] * len(data)
    for idx, raw in enumerate(data):
        item = new_float(value_cls, raw)
        item.variable = variable
        out[idx] = item
    return out

@staticmethod
def _as_values_non_primitive(variable, data) -> Sequence['Value']:
    """
    Build one `Value` per element of `data` for a non-primitive `variable`.

    The float part of each result is NaN where the element compares equal
    to `variable.Unknown` and `np.finfo(float).min` otherwise; the original
    element is stored in the `_value` attribute.

    `data` must support `len()` and iteration.
    """
    assert not variable.is_primitive()
    value_cls = Value
    new_float = float.__new__
    as_objects = np.array(data, dtype=object)
    is_unknown = as_objects == variable.Unknown
    # Float payload carried by each Value: NaN marks an unknown entry.
    payload = np.full(len(data), np.finfo(float).min)
    payload[is_unknown] = np.nan
    # Pre-size the result with a shared placeholder; every slot is
    # overwritten in the loop below.
    out = [Value(variable, Variable.Unknown)] * len(data)
    for idx, (raw, number) in enumerate(zip(data, payload)):
        item = new_float(value_cls, number)
        item.variable = variable
        item._value = raw
        out[idx] = item
    return out

@staticmethod
def _as_values(variable, data):
    """
    Convert `data` to a list of `Value` instances bound to `variable`.

    Intended as a faster equivalent of
    `[Value(variable, v) for v in data]`; the work is delegated to the
    primitive or non-primitive helper depending on
    `variable.is_primitive()`.
    """
    convert = (
        Value._as_values_primitive
        if variable.is_primitive()
        else Value._as_values_non_primitive
    )
    return convert(variable, data)

def __init__(self, _, __=Unknown):
    """Accept the constructor arguments and do nothing.

    `__new__` does the job, so there is nothing left to initialise here.
    """
    # pylint: disable=super-init-not-called
Expand Down
10 changes: 10 additions & 0 deletions Orange/tests/test_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,13 @@ def test_hash(self):
self.assertTrue(val == v and hash(val) == hash(v))
val = Value(DiscreteVariable("var", ["red", "green", "blue"]), 1)
self.assertRaises(TypeError, hash, val)

def test_as_values(self):
    """`Value._as_values` yields `Value` objects equal to the inputs."""
    # pylint: disable=protected-access
    continuous = ContinuousVariable("x")
    numeric = Value._as_values(continuous, [0., 1., 2.])
    self.assertIsInstance(numeric[0], Value)
    self.assertEqual(numeric[0], 0)

    string = StringVariable("s")
    textual = Value._as_values(string, ["a", "b", ""])
    self.assertIsInstance(textual[0], Value)
    self.assertEqual(textual[0], "a")
19 changes: 1 addition & 18 deletions Orange/widgets/data/oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

from Orange.preprocess.transformation import Transformation, Identity, Lookup
from Orange.widgets import widget, gui, settings
from Orange.widgets.utils import itemmodels
from Orange.widgets.utils import itemmodels, ftry
from Orange.widgets.utils.buttons import FixedSizeButton
from Orange.widgets.utils.itemmodels import signal_blocking
from Orange.widgets.utils.widgetpreview import WidgetPreview
Expand All @@ -50,8 +50,6 @@
# Short alias for NumPy's masked-array class.
MArray = np.ma.MaskedArray
# A dtype given either as an `np.dtype` instance or as a plain Python type.
DType = Union[np.dtype, type]

# Unconstrained generic type variables.
A = TypeVar("A") # pylint: disable=invalid-name
B = TypeVar("B") # pylint: disable=invalid-name
# Type variables bounded by Orange.data.Variable and by Hashable respectively.
V = TypeVar("V", bound=Orange.data.Variable) # pylint: disable=invalid-name
H = TypeVar("H", bound=Hashable) # pylint: disable=invalid-name

Expand Down Expand Up @@ -2631,21 +2629,6 @@ def apply_transform_string(var, trs):
return variable


def ftry(
        func: Callable[..., A],
        error: Union[Type[BaseException], Tuple[Type[BaseException], ...]],
        default: B
) -> Callable[..., Union[A, B]]:
    """
    Wrap `func` so that `default` is returned when `error` is raised.

    Parameters
    ----------
    func
        The callable to wrap.
    error
        An exception class, or a tuple of exception classes of any length,
        to intercept. It is used directly in the `except` clause.
    default
        The value returned in place of the intercepted exception.

    Returns
    -------
    A callable that forwards all positional and keyword arguments to
    `func`. Exceptions not matching `error` propagate unchanged.
    """
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except error:
            return default
    return wrapper


class DictMissingConst(dict):
"""
`dict` with a constant for `__missing__()` value.
Expand Down
101 changes: 66 additions & 35 deletions Orange/widgets/data/owfeatureconstructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from traceback import format_exception_only
from collections import namedtuple, OrderedDict
from itertools import chain, count
from itertools import chain, count, starmap
from typing import List, Dict, Any

import numpy as np
Expand All @@ -32,15 +32,19 @@
from orangewidget.utils.combobox import ComboBoxSearch

import Orange
from Orange.data import Variable, Table, Value, Instance
from Orange.data.util import get_unique_names
from Orange.widgets import gui
from Orange.widgets.settings import ContextSetting, DomainContextHandler
from Orange.widgets.utils import itemmodels, vartype
from Orange.widgets.utils import (
itemmodels, vartype, ftry, unique_everseen as unique
)
from Orange.widgets.utils.sql import check_sql_input
from Orange.widgets import report
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import OWWidget, Msg, Input, Output


# Lightweight record with two fields: `name` and `expression`.
FeatureDescriptor = namedtuple(
    "FeatureDescriptor", ["name", "expression"]
)

Expand Down Expand Up @@ -729,11 +733,14 @@ def duplicateFeature(self):

# NOTE(review): this span comes from a diff view with indentation stripped.
# It appears to hold two consecutive implementations of the same check (a
# per-row loop followed by a per-column NumPy version) - TODO confirm which
# one is current before relying on either.
#
# Purpose (both variants): return the name of a variable in `attr` that has a
# non-NaN entry in `data` which is >= len(var.values); return None if there
# is no such variable.
@staticmethod
def check_attrs_values(attr, data):
# Variant 1: visit every row and, within it, every variable; the value is
# truncated with int() before the comparison.
for i in range(len(data)):
for var in attr:
if not math.isnan(data[i, var]) \
and int(data[i, var]) >= len(var.values):
return var.name
# Variant 2: handle one whole column at a time.
for var in attr:
col, _ = data.get_column_view(var)
# True where the column holds a non-NaN entry.
mask = ~np.isnan(col)
# `mask` is passed as both `where` and `out`: the comparison is evaluated
# only at non-NaN positions and written back into `mask`, while NaN
# positions keep their existing False.
grater_or_equal = np.greater_equal(
col, len(var.values), out=mask, where=mask
)
if grater_or_equal.any():
return var.name
# No variable has an out-of-range entry.
return None

def _validate_descriptors(self, desc):
Expand Down Expand Up @@ -1162,25 +1169,59 @@ def __init__(self, expression, args, extra_env=None, cast=None, use_values=False
self.mask_exceptions = True
self.use_values = use_values

# NOTE(review): this span comes from a diff view with indentation stripped and
# contains two interleaved `__call__` definitions. The one taking `instance`
# evaluates the expression itself; the one taking `table` only dispatches to
# `__call_table` / `__call_instance`. TODO confirm which lines belong to the
# current version before editing.
# Definition taking `instance`: a Table is handled by calling `self` on each
# of its rows.
def __call__(self, instance, *_):
if isinstance(instance, Orange.data.Table):
return [self(inst) for inst in instance]
# Definition taking `table`: a Table goes to `__call_table`; the last line of
# this span sends anything else to `__call_instance`.
def __call__(self, table, *_):
if isinstance(table, Table):
return self.__call_table(table)
else:
# The lines from here to `return y` read `instance`, so they presumably
# belong to the definition taking `instance` - verify.
try:
# One argument per (name, var) pair in self.args: str() of the value for
# string variables, the label from var.values for discrete variables when
# use_values is off, and the raw instance value otherwise.
args = [str(instance[var]) if var.is_string
else var.values[int(instance[var])] if var.is_discrete and not self.use_values
else instance[var]
for _, var in self.args]
y = self.func(*args)
# user's expression can contain arbitrary errors
# this also covers missing attributes
except: # pylint: disable=bare-except
# Unless mask_exceptions is set the error propagates; otherwise the result
# is NaN.
if not self.mask_exceptions:
raise
return np.nan
# Convert the result only when a truthy `cast` is configured.
if self.cast:
y = self.cast(y)
return y
return self.__call_instance(table)

def __call_table(self, table):
    """
    Evaluate the expression on every row of `table`.

    One column is extracted for each `(name, var)` pair in `self.args`
    and `self.func` is applied row by row. With `self.mask_exceptions`
    set, a `ValueError` raised during extraction yields an all-NaN array
    of length `len(table)`, and any `Exception` raised by `self.func`
    for a row yields NaN for that row. If `self.cast` is not None it is
    applied to every result.

    Returns a list of results, or a NumPy array of NaN in the masked
    extraction-failure case.
    """
    try:
        columns = [self.extract_column(table, var) for _, var in self.args]
    except ValueError:
        if not self.mask_exceptions:
            raise
        return np.full(len(table), np.nan)

    # Without arguments, the function is still called once per row.
    rows = zip(*columns) if columns else [()] * len(table)
    evaluate = self.func
    if self.mask_exceptions:
        evaluate = ftry(evaluate, Exception, np.nan)
    results = list(starmap(evaluate, rows))
    if self.cast is None:
        return results
    convert = self.cast
    return [convert(result) for result in results]

def __call_instance(self, instance: Instance):
    """
    Evaluate the expression on a single `instance`.

    The instance is wrapped into a one-row `Table` built from its
    `domain`, `x`, `y` and `metas`, and the first (only) result of the
    table evaluation is returned.
    """
    domain = instance.domain
    x_row = np.array([instance.x])
    y_row = np.array([instance.y])
    metas_row = np.array([instance.metas])
    single_row = Table.from_numpy(domain, x_row, y_row, metas_row)
    return self.__call_table(single_row)[0]

def extract_column(self, table: Table, var: Variable):
    """
    Return the column of `var` in `table` as a list of expression inputs.

    The element type depends on the variable:

    - string variables: `var.str_val` applied to each entry;
    - discrete variables when `self.use_values` is off: the matching
      entry of `var.values`, or None where the stored code is not finite;
    - time variables, and every other variable when `self.use_values`
      is on: `Value` instances (time needs them for `str(val)`
      formatting);
    - otherwise: the plain entries from `data.tolist()`.
    """
    data, _ = table.get_column_view(var)
    if var.is_string:
        return list(map(var.str_val, data))
    if var.is_discrete and not self.use_values:
        # The extra trailing None is the label used for missing entries.
        values = np.array([*var.values, None], dtype=object)
        # Point non-finite codes at the trailing None *before* converting
        # to int: casting NaN/inf to an integer is an invalid cast and
        # makes NumPy emit a RuntimeWarning.
        codes = data.copy()
        codes[~np.isfinite(data)] = len(values) - 1
        idx = codes.astype(int)
        return values[idx].tolist()
    if var.is_time or self.use_values:
        # time always needs Values due to str(val) formatting
        return Value._as_values(var, data.tolist())  # pylint: disable=protected-access
    return data.tolist()

def __reduce__(self):
return type(self), (self.expression, self.args,
Expand All @@ -1190,15 +1231,5 @@ def __repr__(self):
return "{0.__name__}{1!r}".format(*self.__reduce__())


def unique(seq):
    """
    Return the distinct elements of `seq` as a list, in first-seen order.

    Elements must be hashable. Of several elements that compare equal,
    the first one encountered is kept.
    """
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(seq))


# Script entry point: run a WidgetPreview of OWFeatureConstructor on the
# "iris" table. Excluded from coverage measurement.
if __name__ == "__main__": # pragma: no cover
WidgetPreview(OWFeatureConstructor).run(Orange.data.Table("iris"))
6 changes: 6 additions & 0 deletions Orange/widgets/data/tests/test_owfeatureconstructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,12 @@ def test_missing_variable(self):
self.assertTrue(np.all(np.isnan(r)))
self.assertTrue(np.isnan(f(data2[0])))

def test_time_str(self):
    """`str()` of a date-only time variable renders as an ISO date."""
    time_var = TimeVariable("T", have_date=True)
    data = Table.from_numpy(Domain([time_var]), [[0], [0]])
    func = FeatureFunc("str(T)", [("T", data.domain[0])])
    result = func(data)
    self.assertEqual(result, ["1970-01-01", "1970-01-01"])

def test_invalid_expression_variable(self):
iris = Table("iris")
f = FeatureFunc("1 / petal_length",
Expand Down
20 changes: 19 additions & 1 deletion Orange/widgets/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
from collections import deque
from typing import (
TypeVar, Callable, Any, Iterable, Optional, Hashable, Type, Union
TypeVar, Callable, Any, Iterable, Optional, Hashable, Type, Union, Tuple
)
from xml.sax.saxutils import escape

Expand Down Expand Up @@ -92,6 +92,8 @@ def qname(type_: type) -> str:

# Module-private generic type variables.
_T1 = TypeVar("_T1") # pylint: disable=invalid-name
# Type variable bounded by enum.Enum.
_E = TypeVar("_E", bound=enum.Enum) # pylint: disable=invalid-name
# Unconstrained type variables.
_A = TypeVar("_A") # pylint: disable=invalid-name
_B = TypeVar("_B") # pylint: disable=invalid-name


def apply_all(seq, op):
Expand All @@ -101,6 +103,22 @@ def apply_all(seq, op):
deque(map(op, seq), maxlen=0)


def ftry(
        func: Callable[..., _A],
        error: Union[Type[BaseException], Tuple[Type[BaseException], ...]],
        default: _B
) -> Callable[..., Union[_A, _B]]:
    """
    Wrap `func` so that `default` is returned when `error` is raised.

    Parameters
    ----------
    func
        The callable to wrap.
    error
        An exception class, or a tuple of exception classes of any length,
        to intercept. It is used directly in the `except` clause.
    default
        The value returned in place of the intercepted exception.

    Returns
    -------
    A callable that forwards all positional and keyword arguments to
    `func`. Exceptions not matching `error` propagate unchanged.
    """
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except error:
            return default
    return wrapper


def unique_everseen(iterable, key=None):
# type: (Iterable[_T1], Optional[Callable[[_T1], Hashable]]) -> Iterable[_T1]
"""
Expand Down