Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding make_column_transformer #14

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion searchgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import defaultdict as _defaultdict
import itertools as _itertools

from sklearn.compose import ColumnTransformer as _ColumnTransformer
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this code scikit-learn 0.19-compatible by importing this in make_column_transformer and skipping the corresponding tests in old versions.

from sklearn.model_selection import GridSearchCV as _GridSearchCV
from sklearn.pipeline import Pipeline as _Pipeline
from sklearn.pipeline import FeatureUnion as _FeatureUnion
Expand Down Expand Up @@ -147,7 +148,7 @@ def _name_steps(steps, default='alt'):
if len(estimators) > 1:
while None in estimators:
estimators.remove(None)
step_names = {type(estimator).__name__.lower()
step_names = {_name_of_estimator(estimator)
for estimator in estimators}
if len(step_names) > 1:
names.append(default)
Expand All @@ -173,6 +174,19 @@ def _name_steps(steps, default='alt'):
return named_steps, grid


def _name_of_estimator(estimator):
if isinstance(estimator, tuple):
# tuples comes from ColumnTransformers. At the moment, sklearn accepts
# both (estimator, list_of_columns) and (list_of_columns, estimator)
tuple_types = {type(tuple_entry) for tuple_entry in estimator}
tuple_types.discard(list)
estimator_type = tuple_types.pop()
else:
estimator_type = type(estimator)

return estimator_type.__name__.lower()


def make_pipeline(*steps, **kwargs):
"""Construct a Pipeline with alternative estimators to search over

Expand Down Expand Up @@ -257,3 +271,33 @@ def make_union(*transformers, **kwargs):
"""
steps, grid = _name_steps(transformers)
return set_grid(_FeatureUnion(steps, **kwargs), **grid)


def make_column_transformer(*transformers, **kwargs):
"""Construct a ColumnTransformer with alternative estimators to search over

Parameters
----------
transformers
Each transformer is specified as one of:

* an (estimator, [column_names]) or ([column_names], estimator) tuple
* None (meaning no features)
* a list of the above, indicating that a grid search should alternate
over the estimators (or None) in the list
kwargs
Keyword arguments to the constructor of
:class:`sklearn.compose.ColumnTransformer`.

Notes
-----
Each step is named according to the set of estimator types in its list:

* if a step has only one type of estimator (disregarding None), it takes
that estimator's class name (lowercased)
* if a step has estimators of mixed type, the step is named 'alt'
* if there are multiple steps of the same name using the above rules,
a suffix '-1', '-2', etc. is added.
"""
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a usage example.

steps, grid = _name_steps(transformers)
return set_grid(_ColumnTransformer(steps, **kwargs), **grid)
35 changes: 34 additions & 1 deletion test_searchgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.datasets import load_iris
from searchgrid import set_grid, build_param_grid, make_grid_search
from searchgrid import make_pipeline, make_union
from searchgrid import make_column_transformer, make_pipeline, make_union


@pytest.mark.parametrize(('estimator', 'param_grid'), [
Expand Down Expand Up @@ -111,3 +111,36 @@ def test_make_pipeline():
assert type(pipe) is Pipeline
assert type(union) is FeatureUnion
assert pipe.memory == '/path/to/nowhere'


def test_make_column_transformer():
    """Check the step names and search grid of make_column_transformer.

    Each row of ``cases`` holds the step specification handed to
    ``make_column_transformer``, the name the step is expected to receive
    and the value expected to be stored for that step in ``transformers``.
    """
    kbest_1 = (SelectKBest(), ['column1'])
    kbest_2 = (SelectKBest(), ['column2a', 'column2b'])
    kbest_3 = (SelectKBest(), ['column3'])
    kbest_4 = (SelectKBest(), ['column4'])
    percentile_5 = (SelectPercentile(), ['column5'])
    kbest_6 = (SelectKBest(), ['column6a', 'column6b'])
    kbest_7 = (SelectKBest(), ['column7'])
    kbest_8 = (SelectKBest(), ['column8'])
    percentile_9 = (SelectPercentile(), ['column9'])

    cases = [
        ([kbest_1, None], 'selectkbest-1', kbest_1),
        ([kbest_2, kbest_3], 'selectkbest-2', kbest_2),
        ([kbest_4, percentile_5], 'alt-1', kbest_4),  # mixed
        (kbest_6, 'selectkbest-3', kbest_6),
        ([None, kbest_7], 'selectkbest-4', None),
        ([kbest_8, None, percentile_9], 'alt-2', kbest_8),  # mixed
        (None, 'nonetype', None),
    ]
    specs = [spec for spec, _, _ in cases]
    expected_names = tuple(name for _, name, _ in cases)
    expected_steps = tuple(step for _, _, step in cases)

    column_transformer = make_column_transformer(*specs)
    names, steps = zip(*column_transformer.transformers)

    assert names == expected_names
    assert steps == expected_steps

    # Only the specifications given as lists are searched over.
    param_grid = column_transformer._param_grid
    assert len(param_grid) == 5
    for spec, name in zip(specs, names):
        if isinstance(spec, list):
            assert param_grid[name] == spec