Raise for unseen categories when materializing from an existing `ModelSpec` (#341)

* Raise error on unseen levels when materializing
* Fix test for unseen categories
* Add test for raising on unseen categories
* Properly handle missings when checking for unseen
* Expand test for unseen missings
* Improve attribute name
* Add comment about dropping missings in tests for new levels
Quantco · Jan 25, 2024 · bd20e0d · bd20e0d
1 parent 249b5e5
commit bd20e0d
Show file tree

Hide file tree

Showing 2 changed files with 78 additions and 5 deletions.
diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py
@@ -429,7 +429,7 @@ def from_categorical(
         reduced_rank: bool,
         missing_method: str = "fail",
         missing_name: str = "(MISSING)",
-        force_convert: bool = False,
+        add_missing_category: bool = False,
     ) -> "_InteractableCategoricalVector":
         """Create an interactable categorical vector from a pandas categorical."""
         categories = list(cat.categories)
@@ -446,7 +446,7 @@ def from_categorical(
                 "if cat_missing_method='fail'."
             )
 
-        if missing_method == "convert" and (-1 in codes or force_convert):
+        if missing_method == "convert" and (-1 in codes or add_missing_category):
             codes[codes == -1] = len(categories)
             categories.append(missing_name)
 
@@ -723,17 +723,35 @@ def encode_contrasts(
             order to avoid spanning the intercept.
     """
     levels = levels if levels is not None else _state.get("categories")
-    force_convert = _state.get("force_convert", False)
+    add_missing_category = _state.get("add_missing_category", False)
+
+    # Check for unseen categories when levels are specified
+    if levels is not None:
+        if missing_method == "convert" and not add_missing_category:
+            # We only need to include NAs in the check in this case because:
+            #  - missing_method == "fail" raises a more appropriate error later
+            #  - missings are no problem in the other cases
+            unseen_categories = set(data.unique()) - set(levels)
+        else:
+            unseen_categories = set(data.dropna().unique()) - set(levels)
+
+        if unseen_categories:
+            raise ValueError(
+                f"Column {data.name} contains unseen categories: {unseen_categories}."
+            )
+
     cat = pandas.Categorical(data._values, categories=levels)
     _state["categories"] = cat.categories
-    _state["force_convert"] = missing_method == "convert" and cat.isna().any()
+    _state["add_missing_category"] = add_missing_category or (
+        missing_method == "convert" and cat.isna().any()
+    )
 
     return _InteractableCategoricalVector.from_categorical(
         cat,
         reduced_rank=reduced_rank,
         missing_method=missing_method,
         missing_name=missing_name,
-        force_convert=force_convert,
+        add_missing_category=add_missing_category,
     )
 
 

diff --git a/tests/test_formula.py b/tests/test_formula.py
@@ -746,6 +746,61 @@ def test_cat_missing_interactions():
     assert tm.from_formula(formula, df).column_names == expected_names
 
 
+@pytest.mark.parametrize(
+    "cat_missing_method", ["zero", "convert", "fail"], ids=["zero", "convert", "fail"]
+)
+def test_unseen_category(cat_missing_method):
+    df = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(["a", "b"]),
+        }
+    )
+    df_unseen = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(["a", "b", "c"]),
+        }
+    )
+    result_seen = tm.from_formula(
+        "cat_1 - 1", df, cat_missing_method=cat_missing_method
+    )
+
+    with pytest.raises(ValueError, match="contains unseen categories"):
+        result_seen.model_spec.get_model_matrix(df_unseen)
+
+
+@pytest.mark.parametrize("cat_missing_method", ["zero", "convert", "fail"])
+def test_unseen_missing(cat_missing_method):
+    df = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(["a", "b"]),
+        }
+    )
+    df_unseen = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(["a", "b", pd.NA]),
+        }
+    )
+    result_seen = tm.from_formula(
+        "cat_1 - 1", df, cat_missing_method=cat_missing_method
+    )
+
+    if cat_missing_method == "convert":
+        with pytest.raises(ValueError, match="contains unseen categories"):
+            result_seen.model_spec.get_model_matrix(df_unseen)
+    elif cat_missing_method == "fail":
+        with pytest.raises(
+            ValueError, match="Categorical data can't have missing values"
+        ):
+            result_seen.model_spec.get_model_matrix(df_unseen)
+    elif cat_missing_method == "zero":
+        result_unseen = result_seen.model_spec.get_model_matrix(df_unseen)
+        assert result_unseen.A.shape == (3, 2)
+        np.testing.assert_array_equal(
+            result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]])
+        )
+        assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"]
+
+
 # Tests from formulaic's test suite
 # ---------------------------------