diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index c850cde7b2..2d48bc96ba 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -303,3 +303,12 @@ Variable types Ordinal Boolean Text + + +Feature Selection +------------------ +.. currentmodule:: featuretools.selection +.. autosummary:: + :toctree: generated/ + + remove_low_information_features diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c2a47cdace..4341f90748 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -3,6 +3,17 @@ Changelog --------- +**v0.1.11** October 31, 2017 + * Package linting (:pr:`7`) + * Custom primitive creation functions (:pr:`13`) + * Split requirements to separate files and pin to latest versions (:pr:`15`) + * Select low information features (:pr:`18`) + * Fix docs typos (:pr:`19`) + * Fixed Diff primitive for rare nan case (:pr:`21`) + * Added some missing doc strings (:pr:`23`) + * Trend fix (:pr:`22`) + * Remove as_dir=False option from EntitySet.to_pickle() (:pr:`20`) + * Entity Normalization Preserves Types of Copy & Additional Variables (:pr:`25`) **v0.1.10** October 12, 2017 * NumTrue primitive added and docstring of other primitives updated (:pr:`11`) @@ -47,3 +58,6 @@ Changelog **v0.1.2** July 3rd, 2017 * Initial release + +.. command +.. 
git log --pretty=oneline --abbrev-commit diff --git a/featuretools/__init__.py b/featuretools/__init__.py index 2ed6ac934c..4bc2934c29 100644 --- a/featuretools/__init__.py +++ b/featuretools/__init__.py @@ -11,4 +11,4 @@ from .utils.pickle_utils import * import featuretools.demo -__version__ = '0.1.10' +__version__ = '0.1.11' diff --git a/featuretools/selection/api.py b/featuretools/selection/api.py index 22321dc2c7..e52d0d9792 100644 --- a/featuretools/selection/api.py +++ b/featuretools/selection/api.py @@ -1,2 +1,2 @@ # flake8: noqa -from .variance_selection import * +from .selection import * diff --git a/featuretools/selection/selection.py b/featuretools/selection/selection.py new file mode 100644 index 0000000000..755af73668 --- /dev/null +++ b/featuretools/selection/selection.py @@ -0,0 +1,18 @@ +def remove_low_information_features(feature_matrix, features=None): + ''' + Select features that have at least 2 unique values and that are not all null + Args: + feature_matrix (:class:`pd.DataFrame`): DataFrame whose columns are + feature names and rows are instances + features (list[:class:`featuretools.PrimitiveBase`] or list[str], optional): + List of features to select + ''' + keep = [c for c in feature_matrix + if (feature_matrix[c].nunique(dropna=False) > 1 and + feature_matrix[c].dropna().shape[0] > 0)] + feature_matrix = feature_matrix[keep] + if features is not None: + features = [f for f in features + if f.get_name() in feature_matrix.columns] + return feature_matrix, features + return feature_matrix diff --git a/featuretools/selection/variance_selection.py b/featuretools/selection/variance_selection.py deleted file mode 100644 index c8b3cf940d..0000000000 --- a/featuretools/selection/variance_selection.py +++ /dev/null @@ -1,143 +0,0 @@ -import pandas as pd - -from featuretools.variable_types import Discrete, Numeric - - -def plot_feature_variances(feature_matrix, - features, - low_threshold=None, - high_threshold=None, - log_plot=True, - kind='hist', - 
print_description=True): - ''' Plot to find a good "knee"/threshold at which to select high variance features - Only applies to numeric features - Thresholds indicate coefficient of variation (standard deviation divided by mean)''' - numeric_features = [f.get_name() for f in features if issubclass(f.variable_type, Numeric)] - stds = feature_matrix[numeric_features].std(axis=0, skipna=True) - means = feature_matrix[numeric_features].mean(axis=0, skipna=True) - cvs = (stds / means).abs() - xlim = [0, cvs.max()] - if low_threshold is not None: - cvs = cvs[cvs > low_threshold] - xlim[0] = low_threshold - if high_threshold is not None: - cvs = cvs[cvs < high_threshold] - xlim[1] = high_threshold - if print_description: - print "Stats about the range of the coefficient of variation across all features" - print cvs.describe() - cvs.plot(kind=kind, xlim=tuple(xlim), logx=log_plot) - return cvs - - -def get_categorical_nunique_ratio(df, drop_nonumeric=True): - if drop_nonumeric: - numeric_columns = df.head()._get_numeric_data().columns - nonnumeric_columns = [f for f in df if f not in numeric_columns] - df = df[nonnumeric_columns] - else: - nonnumeric_columns = df.columns - - nunique = df.nunique(axis=0, dropna=True) - total = df[nonnumeric_columns].count(axis=0) - return nunique / total - - -def plot_categorical_nunique_ratio(feature_matrix, - low_threshold=None, - high_threshold=None, - log_plot=False, - print_description=True): - ''' Plot to find a good "knee"/threshold at which to select categorical features - with a high ratio of nunique to total elements''' - ratio = get_categorical_nunique_ratio(feature_matrix) - - if low_threshold is not None: - ratio = ratio[ratio > low_threshold] - if high_threshold is not None: - ratio = ratio[ratio < high_threshold] - if print_description: - print ratio.describe() - return ratio.plot(kind='kde', logx=log_plot) - - -def select_high_variance_features(feature_matrix, features=None, - cv_threshold=0, - 
categorical_nunique_ratio=None, keep=None): - ''' - Select features above a threshold coefficient of variation - (standard deviation divided by mean). - By default excludes any non-numeric features. If - categorical_nunique_ratio is specified, will - select categorical features whose ratio of unique - elements to total number of nonnull elements - is greater than categorical_nunique_ratio - - Args: - feature_matrix (:class:`pd.DataFrame`): DataFrame whose columns are feature names and rows are instances - features (list[:class:`featuretools.PrimitiveBase`] or list[str], optional): List of features to select - cv_threshold (float): Select features above this coefficient of variation - categorical_nunique_ratio (float): Select categorical features whose ratio of unique - elements to total number of nonnull elements is greater than this parameter - keep (list[str]): list of feature names to force select - ''' - keep = keep or [] - if features: - numeric_features = [f.get_name() for f in features if issubclass(f.variable_type, Numeric)] - stds = feature_matrix[numeric_features].std(axis=0, skipna=True) - means = feature_matrix[numeric_features].mean(axis=0, skipna=True) - else: - stds = feature_matrix.std(axis=0, skipna=True, numeric_only=True) - means = feature_matrix.mean(axis=0, skipna=True, numeric_only=True) - cvs = stds / means - high_variances = cvs[cvs.abs() > cv_threshold] - if features is None: - high_variance_feature_names = [f for f in feature_matrix.columns if f in high_variances or f in keep] - else: - high_variance_features = [f for f in features if f.get_name() in high_variances.index or f.get_name() in keep] - high_variance_feature_names = [f.get_name() for f in high_variance_features] - - high_variance_feature_matrix = feature_matrix[high_variance_feature_names] - if categorical_nunique_ratio is not None: - if features is not None: - discrete_features = [f.get_name() for f in features if issubclass(f.variable_type, Discrete)] - ratio = 
get_categorical_nunique_ratio(feature_matrix[discrete_features], drop_nonumeric=False) - else: - ratio = get_categorical_nunique_ratio(feature_matrix) - - high_ratio = ratio[ratio > categorical_nunique_ratio] - if features is None: - high_cat_feature_names = [f for f in feature_matrix if f in high_ratio.index] - else: - high_cat_features = [f for f in features if f.get_name() in high_ratio.index] - high_cat_feature_names = [f.get_name() for f in high_cat_features] - high_variance_features += high_cat_features - high_cat_fm = feature_matrix[high_cat_feature_names] - high_variance_feature_matrix = pd.concat([high_variance_feature_matrix, high_cat_fm], axis=1) - if features is None: - return high_variance_feature_matrix - else: - return high_variance_feature_matrix, high_variance_features - - -def select_percent_null(feature_matrix, features, max_null_percent=1.0, keep=None): - '''Select features where the percentage of null values is below max_null_percent - - Args: - feature_matrix (:class:`pd.DataFrame`): DataFrame whose columns are feature names and rows are instances - features (list[:class:`featuretools.PrimitiveBase`] or list[str], optional): List of features to select - max_null_percent (float): Select features below this - keep (list[str]): list of feature names to force select - ''' - keep = keep or [] - - null_counts = feature_matrix.isnull().sum() - null_percents = null_counts / feature_matrix.shape[0] - - low_nulls = null_percents[null_percents < max_null_percent] - - low_nulls_features = [f for f in features if f.get_name() in low_nulls.index or f.get_name() in keep] - low_nulls_feature_names = [f.get_name() for f in low_nulls_features] - - return feature_matrix[low_nulls_feature_names], low_nulls_features diff --git a/featuretools/tests/selection/test_selection.py b/featuretools/tests/selection/test_selection.py new file mode 100644 index 0000000000..8d190b46d2 --- /dev/null +++ b/featuretools/tests/selection/test_selection.py @@ -0,0 +1,44 @@ +from 
featuretools.selection import remove_low_information_features +from featuretools.tests.testing_utils import make_ecommerce_entityset +from featuretools import Feature +import numpy as np +import pandas as pd +import pytest + + +@pytest.fixture(scope='module') +def feature_matrix(): + feature_matrix = pd.DataFrame({'test': [0, 1, 2], + 'no_null': [np.nan, 0, 0], + 'some_null': [np.nan, 0, 0], + 'all_null': [np.nan, np.nan, np.nan], + 'many_value': [1, 2, 3], + 'dup_value': [1, 1, 2], + 'one_value': [1, 1, 1]}) + return feature_matrix + + +@pytest.fixture(scope='module') +def es(feature_matrix): + es = make_ecommerce_entityset() + es.entity_from_dataframe('test', feature_matrix, index='test') + return es + + +def test_remove_low_information_feature_names(feature_matrix): + feature_matrix = remove_low_information_features(feature_matrix) + assert feature_matrix.shape == (3, 5) + assert 'one_value' not in feature_matrix.columns + assert 'all_null' not in feature_matrix.columns + + +def test_remove_low_information_features(es, feature_matrix): + features = [Feature(v) for v in es['test'].variables] + feature_matrix, features = remove_low_information_features(feature_matrix, + features) + assert feature_matrix.shape == (3, 5) + assert len(features) == 5 + for f in features: + assert f.get_name() in feature_matrix.columns + assert 'one_value' not in feature_matrix.columns + assert 'all_null' not in feature_matrix.columns diff --git a/featuretools/tests/selection/test_variance_selection.py b/featuretools/tests/selection/test_variance_selection.py deleted file mode 100644 index cd71ccd454..0000000000 --- a/featuretools/tests/selection/test_variance_selection.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd -import pytest - -from featuretools import Feature -from featuretools.selection import select_high_variance_features -from featuretools.tests.testing_utils import make_ecommerce_entityset - - -@pytest.fixture(scope='module') -def feature_matrix(): - feature_matrix = 
pd.DataFrame({'numeric_low': [0, 0, 0], - 'numeric_high': [0, 100, 200], - 'numeric_high_low_cv': [9500, 10000, 10500], - 'categorical_low': ['test', 'test', 'test2'], - 'categorical_high': ['test1', 'test2', 'test3']}) - return feature_matrix - - -@pytest.fixture(scope='module') -def es(feature_matrix): - es = make_ecommerce_entityset() - es.entity_from_dataframe('test', feature_matrix, index='test', make_index=True) - return es - - -def test_select_high_variance_feature_names(feature_matrix): - feature_matrix = select_high_variance_features(feature_matrix, cv_threshold=0.5, categorical_nunique_ratio=.7) - assert feature_matrix.shape == (3, 2) - assert 'numeric_high' in feature_matrix.columns - assert 'categorical_high' in feature_matrix.columns - - -def test_select_high_variance_features(es, feature_matrix): - features = [Feature(v) for v in es['test'].variables] - feature_matrix, features = select_high_variance_features(feature_matrix, features, cv_threshold=0.5, categorical_nunique_ratio=.7) - assert feature_matrix.shape == (3, 2) - assert len(features) == 2 - for f in features: - assert f.get_name() in feature_matrix.columns - - assert 'numeric_high' in feature_matrix.columns - assert 'categorical_high' in feature_matrix.columns diff --git a/setup.py b/setup.py index d17cd2d115..a05496c949 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def finalize_options(self): setup( name='featuretools', - version='0.1.10', + version='0.1.11', packages=find_packages(), description='a framework for automated feature engineering', url='http://featuretools.com',