Skip to content

Commit

Permalink
Raise for unseen categories when materializing from an existing `ModelSpec` (#341)
Browse files Browse the repository at this point in the history

* Raise error on unseen levels when materializing

* Fix test for unseen categories

* Add test for raising on unseen categories

* Properly handle missings when checking for unseen

* Expand test for unseen missings

* Improve attribute name

* Add comment about dropping missings in tests for new levels
  • Loading branch information
stanmart authored Jan 25, 2024
1 parent 249b5e5 commit bd20e0d
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 5 deletions.
28 changes: 23 additions & 5 deletions src/tabmat/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def from_categorical(
reduced_rank: bool,
missing_method: str = "fail",
missing_name: str = "(MISSING)",
force_convert: bool = False,
add_missing_category: bool = False,
) -> "_InteractableCategoricalVector":
"""Create an interactable categorical vector from a pandas categorical."""
categories = list(cat.categories)
Expand All @@ -446,7 +446,7 @@ def from_categorical(
"if cat_missing_method='fail'."
)

if missing_method == "convert" and (-1 in codes or force_convert):
if missing_method == "convert" and (-1 in codes or add_missing_category):
codes[codes == -1] = len(categories)
categories.append(missing_name)

Expand Down Expand Up @@ -723,17 +723,35 @@ def encode_contrasts(
order to avoid spanning the intercept.
"""
levels = levels if levels is not None else _state.get("categories")
force_convert = _state.get("force_convert", False)
add_missing_category = _state.get("add_missing_category", False)

# Check for unseen categories when levels are specified
if levels is not None:
if missing_method == "convert" and not add_missing_category:
# We only need to include NAs in the check in this case because:
# - missing_method == "fail" raises a more appropriate error later
# - missings are no problem in the other cases
unseen_categories = set(data.unique()) - set(levels)
else:
unseen_categories = set(data.dropna().unique()) - set(levels)

if unseen_categories:
raise ValueError(
f"Column {data.name} contains unseen categories: {unseen_categories}."
)

cat = pandas.Categorical(data._values, categories=levels)
_state["categories"] = cat.categories
_state["force_convert"] = missing_method == "convert" and cat.isna().any()
_state["add_missing_category"] = add_missing_category or (
missing_method == "convert" and cat.isna().any()
)

return _InteractableCategoricalVector.from_categorical(
cat,
reduced_rank=reduced_rank,
missing_method=missing_method,
missing_name=missing_name,
force_convert=force_convert,
add_missing_category=add_missing_category,
)


Expand Down
55 changes: 55 additions & 0 deletions tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,61 @@ def test_cat_missing_interactions():
assert tm.from_formula(formula, df).column_names == expected_names


@pytest.mark.parametrize(
    "cat_missing_method",
    ["zero", "convert", "fail"],
    ids=["zero", "convert", "fail"],
)
def test_unseen_category(cat_missing_method):
    """Check that a level absent at fit time is rejected at materialization.

    A model spec is built from a frame whose ``cat_1`` column has levels
    ``a`` and ``b``; reusing that spec on a frame that additionally contains
    ``c`` must raise ``ValueError`` for each parametrized missing method.
    """
    # Frame used to build the model spec: only levels "a" and "b".
    train_frame = pd.DataFrame({"cat_1": pd.Categorical(["a", "b"])})
    # Frame with the extra level "c" that the spec has never seen.
    new_level_frame = pd.DataFrame({"cat_1": pd.Categorical(["a", "b", "c"])})

    fitted = tm.from_formula(
        "cat_1 - 1",
        train_frame,
        cat_missing_method=cat_missing_method,
    )

    with pytest.raises(ValueError, match="contains unseen categories"):
        fitted.model_spec.get_model_matrix(new_level_frame)


@pytest.mark.parametrize("cat_missing_method", ["zero", "convert", "fail"])
def test_unseen_missing(cat_missing_method):
    """Check how a spec fitted without missings treats missings in new data.

    The spec is built from a frame with levels ``a`` and ``b`` and no missing
    values, then applied to a frame that adds a ``pd.NA`` entry:

    * ``"convert"`` must raise the unseen-categories ``ValueError``;
    * ``"fail"`` must raise the missing-values ``ValueError``;
    * ``"zero"`` must succeed, with the missing row encoded as all zeros.
    """
    # Frame used to build the model spec: no missing values present.
    train_frame = pd.DataFrame({"cat_1": pd.Categorical(["a", "b"])})
    # Same levels plus one missing entry that was absent when fitting.
    missing_frame = pd.DataFrame({"cat_1": pd.Categorical(["a", "b", pd.NA])})

    fitted = tm.from_formula(
        "cat_1 - 1",
        train_frame,
        cat_missing_method=cat_missing_method,
    )

    # Methods that are expected to fail, mapped to the message they must match.
    expected_error = {
        "convert": "contains unseen categories",
        "fail": "Categorical data can't have missing values",
    }

    if cat_missing_method in expected_error:
        with pytest.raises(ValueError, match=expected_error[cat_missing_method]):
            fitted.model_spec.get_model_matrix(missing_frame)
    elif cat_missing_method == "zero":
        materialized = fitted.model_spec.get_model_matrix(missing_frame)
        # Three rows, two level columns; the row holding the missing is all zeros.
        zero_encoded = np.array([[1, 0], [0, 1], [0, 0]])
        assert materialized.A.shape == (3, 2)
        np.testing.assert_array_equal(materialized.A, zero_encoded)
        assert materialized.column_names == ["cat_1[a]", "cat_1[b]"]


# Tests from formulaic's test suite
# ---------------------------------

Expand Down

0 comments on commit bd20e0d

Please sign in to comment.