diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index c1198c72..0afe6df1 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -429,7 +429,7 @@ def from_categorical( reduced_rank: bool, missing_method: str = "fail", missing_name: str = "(MISSING)", - force_convert: bool = False, + add_missing_category: bool = False, ) -> "_InteractableCategoricalVector": """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) @@ -446,7 +446,7 @@ def from_categorical( "if cat_missing_method='fail'." ) - if missing_method == "convert" and (-1 in codes or force_convert): + if missing_method == "convert" and (-1 in codes or add_missing_category): codes[codes == -1] = len(categories) categories.append(missing_name) @@ -723,17 +723,35 @@ def encode_contrasts( order to avoid spanning the intercept. """ levels = levels if levels is not None else _state.get("categories") - force_convert = _state.get("force_convert", False) + add_missing_category = _state.get("add_missing_category", False) + + # Check for unseen categories when levels are specified + if levels is not None: + if missing_method == "convert" and not add_missing_category: + # We only need to include NAs in the check in this case because: + # - missing_method == "fail" raises a more appropriate error later + # - missings are no problem in the other cases + unseen_categories = set(data.unique()) - set(levels) + else: + unseen_categories = set(data.dropna().unique()) - set(levels) + + if unseen_categories: + raise ValueError( + f"Column {data.name} contains unseen categories: {unseen_categories}." + ) + cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories - _state["force_convert"] = missing_method == "convert" and cat.isna().any() + _state["add_missing_category"] = add_missing_category or ( + missing_method == "convert" and cat.isna().any() + ) return _InteractableCategoricalVector.from_categorical( cat, reduced_rank=reduced_rank, missing_method=missing_method, missing_name=missing_name, - force_convert=force_convert, + add_missing_category=add_missing_category, ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 44213964..2cfefa02 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -746,6 +746,61 @@ def test_cat_missing_interactions(): assert tm.from_formula(formula, df).column_names == expected_names +@pytest.mark.parametrize( + "cat_missing_method", ["zero", "convert", "fail"], ids=["zero", "convert", "fail"] +) +def test_unseen_category(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", "c"]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + + +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert", "fail"]) +def test_unseen_missing(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", pd.NA]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + + if cat_missing_method == "convert": + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + elif cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + result_seen.model_spec.get_model_matrix(df_unseen) + elif cat_missing_method == "zero": + result_unseen = result_seen.model_spec.get_model_matrix(df_unseen) + assert result_unseen.A.shape == (3, 2) + np.testing.assert_array_equal( + result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]]) + ) + assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"] + + # Tests from formulaic's test suite # ---------------------------------