strongio · andywong36 · Jun 18, 2024 · Jun 18, 2024 · Jun 18, 2024 · Jun 18, 2024
diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py
@@ -15,7 +15,7 @@
 
 
 class Binned:
-    def __init__(self, col: str, bins: Union[int, Sequence] = 20, **kwargs):
+    def __init__(self, col: str, bins: Union[None, int, Sequence] = 20, **kwargs):
         """
         This class creates an object which can bin a pandas.Series.
         ```
@@ -504,26 +504,53 @@ def _get_binned_feature_map(X: pd.DataFrame,
                                 aggfun: Union[str, Callable]) -> pd.DataFrame:
         """
         Get a dataframe that maps the binned version of a feature to the aggregates of its original values.
+
+        :param X: A dataframe which contains the columns binned_fname and fname.
+        :param binned_fname: The column name of the binned data.
+        :param fname: The column name of the unbinned data.
+        :param aggfun: The aggregation of X[fname] within each group of binned_fname. The special case of 'mid' will
+        use the midpoint of the bins in X[binned_fname]. If a bin has no actual values to aggregate, the midpoint of
+        that bin is used instead.
+
+        :returns: A pd.DataFrame with columns [binned_fname, fname], where the fname column holds the aggregated values.
+        :raises ValueError: if fname and binned_fname are the same, or if any resulting aggregated value is nan or
+        inf (e.g. an unobserved bin with an infinite edge, whose midpoint is not finite).
         """
-        assert binned_fname != fname
+        if binned_fname == fname:
+            raise ValueError("binned_fname and fname cannot be the same column.")
 
         if aggfun == 'mid':
-            # creates a df with unique values of `binned_fname` and `nans` for `fname`.
-            # this will then get filled with the midpoint below:
-            # todo: less hacky way to do this
-            df_mapping = X.groupby(binned_fname, observed=False)[fname].agg('count').reset_index()
-            df_mapping[fname] = float('nan')
-        else:
-            df_mapping = X.groupby(binned_fname, observed=False)[fname].agg(aggfun).reset_index()
-
-        # for any bins that aren't actually observed, use the midpoint:
-        midpoints = pd.Series([x.mid for x in df_mapping[binned_fname]])
-        if np.isinf(midpoints).any() and df_mapping[fname].isnull().any():
-            raise ValueError(
-                f"[{fname}] `inf` bin cuts cannot be used when no data present in the bin:"
-                f"{df_mapping[binned_fname][np.isinf(midpoints)]}"
+            # relies on groupby-apply exposing each group's key (the bin's Interval) as `series.name`:
+            aggfun = lambda series: series.name.mid
+
+        df_mapping = (
+            X
+            .groupby(
+                binned_fname,
+                group_keys=True,
+                observed=False,
+                as_index=False,
             )
-        df_mapping[fname].fillna(midpoints, inplace=True)
+            [fname]
+            .apply(aggfun)
+            .assign(**{
+                fname: lambda df: (
+                    df
+                    [fname]
+                    .fillna(
+                        df
+                        [binned_fname]
+                        .map(lambda interval: interval.mid)
+                        .astype(float)
+                    )
+                )
+            })
+        )
+
+        # `mode.use_inf_as_na` is deprecated since pandas 2.1, so test for nan and inf explicitly:
+        if df_mapping[fname].isna().any() or np.isinf(df_mapping[fname]).any():
+            raise ValueError(f"aggfun resulted in invalid values: \n {df_mapping}")
+
         return df_mapping
 
     def _get_df_novary(self,

diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py
@@ -1,13 +1,15 @@
 from typing import Callable
-import pandas as pd
+from unittest.mock import create_autospec
+
 import numpy as np
+import pandas as pd
 import pytest
-from unittest.mock import create_autospec
-from pandas.testing import assert_series_equal
+from foundry.evaluation.marginal_effects import (Binned, MarginalEffects,
+                                                 binned, raw)
+from pandas.testing import assert_frame_equal, assert_series_equal
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 
-from foundry.evaluation.marginal_effects import Binned, MarginalEffects, binned, raw
 
 class TestBinned():
     @pytest.mark.parametrize(
@@ -72,7 +74,7 @@ def test_binned_init(self, bins):
                 )
             ),
             (
-                False,
+                None,
                 pd.Series(list(range(20)), name="my_feature")
             )
         ],
@@ -133,3 +135,91 @@ def test_feature_names_in(self, col_transformer__columns, expected):
 
         assert isinstance(me.feature_names_in, list)
         assert list(sorted(expected)) == list(sorted(me.feature_names_in))
+
+    # Expected binned column for the parametrized cases below: one row per bin, as an ordered categorical of Intervals.
+    binned_col_A = pd.Series(
+        [
+            pd.Interval(0.999, 2.0),
+            pd.Interval(2.0, 3.0),
+        ],
+        dtype=pd.CategoricalDtype(
+            categories=[
+                pd.Interval(0.999, 2.0),
+                pd.Interval(2.0, 3.0)
+            ],
+            ordered=True
+        ),
+        name="binnedA"
+    )
+
+    @pytest.mark.parametrize(
+        argnames=["aggfun", "expected"],
+        argvalues=[
+            (
+                "mid",
+                pd.DataFrame({"binnedA": binned_col_A, "colA": [1.4995, 2.5]})
+            ),
+            (
+                "min",
+                pd.DataFrame({"binnedA": binned_col_A, "colA": [1, 3]})
+            ),
+            (
+                np.median,
+                pd.DataFrame({"binnedA": binned_col_A, "colA": [1.5, 3.0]})
+            ),
+        ]
+    )
+    def test__get_binned_feature_map(self, aggfun, expected):
+        """Each kind of aggfun ('mid', a named aggregation, a callable) yields one aggregated colA value per bin."""
+        df = (
+            self.x_data
+            .assign(
+                **{
+                    "binnedA": [
+                        pd.Interval(0.999, 2.0),
+                        pd.Interval(0.999, 2.0),
+                        pd.Interval(2.0, 3.0),
+                    ],
+                },
+            )
+            .astype({"binnedA": self.binned_col_A.dtype})
+        )
+
+        test = MarginalEffects._get_binned_feature_map(
+            df,
+            "binnedA",
+            "colA",
+            aggfun=aggfun,
+        )
+
+        assert_frame_equal(test, expected)
+
+    def test__get_binned_feature_map_empty_bins(self):
+        """An unobserved bin with an infinite edge has no finite midpoint to fall back on, so a ValueError is raised."""
+        df = (
+            self.x_data
+            .assign(
+                **{
+                    "binnedA": pd.Categorical(
+                        [
+                            pd.Interval(0.999, 2.0),
+                            pd.Interval(0.999, 2.0),
+                            pd.Interval(2.0, 3.0),
+                        ],
+                        categories=[
+                            pd.Interval(-np.inf, 0.999),
+                            pd.Interval(0.999, 2.0),
+                            pd.Interval(2.0, 3.0)
+                        ],
+                    )
+                },
+            )
+        )
+
+        with pytest.raises(ValueError, match="invalid values"):
+            MarginalEffects._get_binned_feature_map(
+                df,
+                "binnedA",
+                "colA",
+                "median",
+            )
diff --git a/tests/preprocessing/sklearn/test_dataframe_transformer.py b/tests/preprocessing/sklearn/test_dataframe_transformer.py
@@ -19,7 +19,7 @@ class TestDataFrameTransformer:
             (np.zeros((3, 2)), pd.DataFrame(np.zeros((3, 2)))),
             # convert sparse:
             (
-                    OneHotEncoder(sparse=True).fit_transform([['a'], ['b'], ['c'], ['d']]),
+                    OneHotEncoder(sparse_output=True).fit_transform([['a'], ['b'], ['c'], ['d']]),
                     pd.DataFrame(np.eye(4))
             )
         ]