diff --git a/R-package/R/abesspca.R b/R-package/R/abesspca.R
index e7e57ca8..ccc68731 100644
--- a/R-package/R/abesspca.R
+++ b/R-package/R/abesspca.R
@@ -235,7 +235,7 @@ abesspca <- function(x,
     exchange_num = c_max,
     path_type = path_type,
     is_warm_start = warm.start,
-    ic_type = 1,
+    ic_type = ic_type,
     ic_coef = ic_scale,
     Kfold = nfolds,
     sequence = s_list_bool,
diff --git a/R-package/R/utility.R b/R-package/R/utility.R
index 62449990..d49ba626 100644
--- a/R-package/R/utility.R
+++ b/R-package/R/utility.R
@@ -82,7 +82,7 @@ map_tunetype2numeric <- function(tune.type) {
     "bic" = 2,
     "gic" = 3,
     "ebic" = 4,
-    "cv" = 1
+    "cv" = 0
   )
   ic_type
 }
diff --git a/python/abess/bess_base.py b/python/abess/bess_base.py
index a1fd821a..262d4d43 100644
--- a/python/abess/bess_base.py
+++ b/python/abess/bess_base.py
@@ -40,7 +40,7 @@ class bess_base(BaseEstimator):

         - If alpha = 0, it indicates ordinary least square.

-    ic_type : {'aic', 'bic', 'gic', 'ebic'}, optional, default='ebic'
+    ic_type : {'aic', 'bic', 'gic', 'ebic', 'loss'}, optional, default='ebic'
         The type of criterion for choosing the support size if `cv=1`.
     ic_coef : float, optional, default=1.0
         Constant that controls the regularization strength
@@ -52,6 +52,14 @@ class bess_base(BaseEstimator):

         - If cv>1, support size will be chosen by CV's test loss,
           instead of IC.

+    cv_score : {'test_loss', ...}, optional, default='test_loss'
+        The score used on test data for CV.
+
+        - All methods support {'test_loss'}.
+        - LogisticRegression also supports {'roc_auc'}.
+        - MultinomialRegression also supports {'roc_auc_ovo', 'roc_auc_ovr'},
+          which indicate the "One vs One/Rest" algorithms, respectively.
+
     thread : int, optional, default=1
         Max number of multithreads.
@@ -131,6 +139,7 @@ def __init__(
             ic_type="ebic",
             ic_coef=1.0,
             cv=1,
+            cv_score="test_loss",
             thread=1,
             A_init=None,
             always_select=None,
@@ -170,6 +179,7 @@ def __init__(
         self.ic_type = ic_type
         self.ic_coef = ic_coef
         self.cv = cv
+        self.cv_score = cv_score
         self.screening_size = screening_size
         self.always_select = always_select
         self.primary_model_fit_max_iter = primary_model_fit_max_iter
@@ -323,28 +333,49 @@ def fit(self,
         else:
             raise ValueError("path_type should be \'seq\' or \'gs\'")

-        # Ic_type: aic, bic, gic, ebic
-        if self.ic_type == "aic":
-            ic_type_int = 1
-        elif self.ic_type == "bic":
-            ic_type_int = 2
-        elif self.ic_type == "gic":
-            ic_type_int = 3
-        elif self.ic_type == "ebic":
-            ic_type_int = 4
-        elif self.ic_type == "hic":
-            ic_type_int = 5
-        else:
-            raise ValueError(
-                "ic_type should be \"aic\", \"bic\", \"ebic\","
-                " \"gic\" or \"hic\".")
-
         # cv
         if (not isinstance(self.cv, int) or self.cv <= 0):
             raise ValueError("cv should be a positive integer.")
         if self.cv > n:
             raise ValueError("cv should be smaller than n.")

+        # Ic_type: loss, aic, bic, gic, ebic, hic
+        # cv_score: test_loss, roc_auc (and its "ovo"/"ovr" variants)
+        if self.cv == 1:
+            if self.ic_type == "loss":
+                eval_type_int = 0
+            elif self.ic_type == "aic":
+                eval_type_int = 1
+            elif self.ic_type == "bic":
+                eval_type_int = 2
+            elif self.ic_type == "gic":
+                eval_type_int = 3
+            elif self.ic_type == "ebic":
+                eval_type_int = 4
+            elif self.ic_type == "hic":
+                eval_type_int = 5
+            else:
+                raise ValueError(
+                    "ic_type should be \"loss\", \"aic\", \"bic\","
+                    " \"ebic\", \"gic\" or \"hic\".")
+        else:
+            if self.cv_score == "test_loss":
+                eval_type_int = 0
+            elif self.cv_score == "roc_auc" and self.model_type == "Logistic":
+                eval_type_int = 1
+            elif (self.cv_score == "roc_auc_ovo" and
+                    self.model_type == "Multinomial"):
+                eval_type_int = 2
+            elif (self.cv_score == "roc_auc_ovr" and
+                    self.model_type == "Multinomial"):
+                eval_type_int = 3
+            else:
+                raise ValueError(
+                    "cv_score should be \"test_loss\", "
+                    "\"roc_auc\"(for logistic), "
+                    "\"roc_auc_ovo\"(for multinomial), or "
+                    "\"roc_auc_ovr\"(for multinomial).")
+
         # cv_fold_id
         if cv_fold_id is None:
             cv_fold_id = np.array([], dtype="int32")
@@ -561,7 +592,7 @@ def fit(self,
             X, y, sample_weight, n, p, normalize, algorithm_type_int,
             model_type_int, self.max_iter, self.exchange_num,
             path_type_int,
-            self.is_warm_start, ic_type_int, self.ic_coef, self.cv,
+            self.is_warm_start, eval_type_int, self.ic_coef, self.cv,
             g_index, support_sizes, alphas, cv_fold_id,
             new_s_min, new_s_max, new_lambda_min, new_lambda_max,
             n_lambda, self.screening_size,
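The new `cv_score` option above only takes effect when `cv > 1`; with the default `cv=1`, the support size is chosen by `ic_type` and `cv_score` is ignored. A minimal usage sketch, relying only on the public `abess` API that the tests below exercise (the data sizes are arbitrary):

    import abess

    data = abess.make_glm_data(n=100, p=20, k=5, family="binomial")

    # cv > 1: the support size is chosen by maximizing AUC on held-out folds.
    model = abess.LogisticRegression(cv=5, cv_score="roc_auc")
    model.fit(data.x, data.y)

    # cv == 1 (default): cv_score is ignored and ic_type decides instead.
    model = abess.LogisticRegression(ic_type="ebic")
    model.fit(data.x, data.y)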
self.model_type == "Multinomial"): + eval_type_int = 3 + else: + raise ValueError( + "cv_score should be \"test_loss\", " + "\"roc_auc\"(for logistic), " + "\"roc_auc_ovo\"(for multinomial), or " + "\"roc_auc_ovr\"(for multinomial).") + # cv_fold_id if cv_fold_id is None: cv_fold_id = np.array([], dtype="int32") @@ -561,7 +592,7 @@ def fit(self, X, y, sample_weight, n, p, normalize, algorithm_type_int, model_type_int, self.max_iter, self.exchange_num, path_type_int, - self.is_warm_start, ic_type_int, self.ic_coef, self.cv, + self.is_warm_start, eval_type_int, self.ic_coef, self.cv, g_index, support_sizes, alphas, cv_fold_id, new_s_min, new_s_max, new_lambda_min, new_lambda_max, n_lambda, self.screening_size, diff --git a/python/abess/decomposition.py b/python/abess/decomposition.py index 02d87828..7619d568 100644 --- a/python/abess/decomposition.py +++ b/python/abess/decomposition.py @@ -41,6 +41,10 @@ class SparsePCA(bess_base): - If cv>1, support size will be chosen by CV's test loss, instead of IC. + cv_score : {'test_loss'}, optional, default='test_loss' + The score used on test data for CV. + Only 'test_loss' is supported for PCA now. + thread : int, optional, default=1 Max number of multithreads. @@ -125,8 +129,9 @@ class SparsePCA(bess_base): """ def __init__(self, support_size=None, group=None, - ic_type="loss", ic_coef=1.0, cv=1, thread=1, - A_init=None, always_select=None, + ic_type="loss", ic_coef=1.0, + cv=1, cv_score="test_loss", + thread=1, A_init=None, always_select=None, max_iter=20, exchange_num=5, is_warm_start=True, splicing_type=1, screening_size=-1, @@ -137,7 +142,7 @@ def __init__(self, support_size=None, group=None, max_iter=max_iter, exchange_num=exchange_num, is_warm_start=is_warm_start, support_size=support_size, # s_min=s_min, s_max=s_max, - ic_type=ic_type, ic_coef=ic_coef, cv=cv, + ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score, screening_size=screening_size, always_select=always_select, thread=thread, @@ -264,23 +269,31 @@ def fit(self, X=None, y=None, is_normal=False, # model_type_int = 7 path_type_int = 1 - # Ic_type - if self.ic_type == "loss": - ic_type_int = 0 - elif self.ic_type == "aic": - ic_type_int = 1 - elif self.ic_type == "bic": - ic_type_int = 2 - elif self.ic_type == "gic": - ic_type_int = 3 - elif self.ic_type == "ebic": - ic_type_int = 4 - elif self.ic_type == "hic": - ic_type_int = 5 + # Ic_type: aic, bic, gic, ebic + # cv_score: test_loss, roc_auc + if self.cv == 1: + if self.ic_type == "loss": + eval_type_int = 0 + elif self.ic_type == "aic": + eval_type_int = 1 + elif self.ic_type == "bic": + eval_type_int = 2 + elif self.ic_type == "gic": + eval_type_int = 3 + elif self.ic_type == "ebic": + eval_type_int = 4 + elif self.ic_type == "hic": + eval_type_int = 5 + else: + raise ValueError( + "ic_type should be \"aic\", \"bic\", \"ebic\"," + " \"gic\" or \"hic\".") else: - raise ValueError( - "ic_type should be \"loss\", \"aic\", \"bic\"," - " \"ebic\", \"gic\" or \"hic\".") + if self.cv_score == "test_loss": + eval_type_int = 0 + else: + raise ValueError( + "cv_score should be \"test_loss\".") # cv if (not isinstance(self.cv, int) or self.cv <= 0): @@ -425,7 +438,7 @@ def fit(self, X=None, y=None, is_normal=False, n, p, normalize, Sigma, self.max_iter, self.exchange_num, path_type_int, self.is_warm_start, - ic_type_int, self.ic_coef, self.cv, + eval_type_int, self.ic_coef, self.cv, g_index, support_sizes, cv_fold_id, @@ -633,15 +646,15 @@ def fit(self, X, y=None, r=None, sparse_matrix=False): # Ic_type if self.ic_type == "aic": - 
diff --git a/python/abess/linear.py b/python/abess/linear.py
index 5494f9d6..a4603845 100644
--- a/python/abess/linear.py
+++ b/python/abess/linear.py
@@ -10,6 +10,7 @@
 except ImportError:
     from .functions import d2_tweedie_score

+
 @fix_docs
 class LogisticRegression(bess_base):
     r"""
@@ -59,7 +60,8 @@ class LogisticRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="roc_auc",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -72,7 +74,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -218,7 +220,8 @@ class LinearRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -232,7 +235,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             thread=thread, covariance_update=covariance_update,
@@ -340,7 +343,8 @@ class CoxPHSurvivalAnalysis(bess_base, BreslowEstimator):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -353,7 +357,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -494,7 +498,8 @@ class PoissonRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -507,7 +512,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -636,7 +641,8 @@ class MultiTaskRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -651,7 +657,7 @@ def __init__(self, path_type="seq", support_size=None,
             max_iter=max_iter, exchange_num=exchange_num,
             is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             thread=thread, covariance_update=covariance_update,
@@ -769,7 +775,8 @@ class MultinomialRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -782,7 +789,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -938,7 +945,8 @@ class GammaRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -951,7 +959,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -1082,7 +1090,8 @@ class OrdinalRegression(bess_base):

     def __init__(self, path_type="seq", support_size=None,
                  s_min=None, s_max=None, group=None, alpha=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, thread=1, A_init=None,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 thread=1, A_init=None,
                  always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=0, important_search=128, screening_size=-1,
@@ -1095,7 +1104,7 @@ def __init__(self, path_type="seq", support_size=None,
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -1200,7 +1209,8 @@ class abessLogistic(LogisticRegression):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="roc_auc",
+                 screening_size=-1,
                  always_select=None,
                  primary_model_fit_max_iter=10,
                  primary_model_fit_epsilon=1e-8, approximate_Newton=False,
@@ -1215,7 +1225,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -1237,7 +1247,8 @@ class abessLm(LinearRegression):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  thread=1, covariance_update=False,
                  A_init=None,
@@ -1252,7 +1263,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             thread=thread, covariance_update=covariance_update,
@@ -1271,7 +1282,8 @@ class abessCox(CoxPHSurvivalAnalysis):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  primary_model_fit_max_iter=10,
                  primary_model_fit_epsilon=1e-8, approximate_Newton=False,
@@ -1286,7 +1298,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -1308,7 +1320,8 @@ class abessPoisson(PoissonRegression):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  primary_model_fit_max_iter=10,
                  primary_model_fit_epsilon=1e-8, thread=1,
@@ -1322,7 +1335,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -1343,7 +1356,8 @@ class abessMultigaussian(MultiTaskRegression):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  thread=1, covariance_update=False,
                  A_init=None,
@@ -1356,7 +1370,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             thread=thread, covariance_update=covariance_update,
@@ -1375,7 +1389,8 @@ class abessMultinomial(MultinomialRegression):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  primary_model_fit_max_iter=10,
                  primary_model_fit_epsilon=1e-8,
                  # approximate_Newton=False,
@@ -1390,7 +1405,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
@@ -1412,7 +1427,8 @@ class abessGamma(GammaRegression):

     def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
                  is_warm_start=True, support_size=None, alpha=None,
                  s_min=None, s_max=None,
-                 ic_type="ebic", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="ebic", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  primary_model_fit_max_iter=10,
                  primary_model_fit_epsilon=1e-8,
@@ -1427,7 +1443,7 @@ def __init__(self, max_iter=20, exchange_num=5, path_type="seq",
             path_type=path_type, max_iter=max_iter,
             exchange_num=exchange_num, is_warm_start=is_warm_start,
             support_size=support_size, alpha=alpha, s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             primary_model_fit_max_iter=primary_model_fit_max_iter,
diff --git a/python/abess/pca.py b/python/abess/pca.py
index ba1f8699..74d24a96 100644
--- a/python/abess/pca.py
+++ b/python/abess/pca.py
@@ -13,7 +13,8 @@ class abessPCA(SparsePCA):

     def __init__(self, max_iter=20, exchange_num=5,
                  is_warm_start=True, support_size=None,
-                 ic_type="loss", ic_coef=1.0, cv=1, screening_size=-1,
+                 ic_type="loss", ic_coef=1.0, cv=1, cv_score="test_loss",
+                 screening_size=-1,
                  always_select=None,
                  thread=1,
                  A_init=None,
@@ -25,7 +26,8 @@ def __init__(self, max_iter=20, exchange_num=5,
             max_iter=max_iter, exchange_num=exchange_num,
             is_warm_start=is_warm_start,
             support_size=support_size,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef,
+            cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             thread=thread,
diff --git a/python/abess/utilities.py b/python/abess/utilities.py
index 8072d10b..dc536cdb 100644
--- a/python/abess/utilities.py
+++ b/python/abess/utilities.py
@@ -1,6 +1,7 @@
 import numpy as np
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

+
 def fix_docs(cls):
     """
     This function is to inherit the docstring from base class
@@ -27,6 +28,7 @@ def fix_docs(cls):
     cls.__doc__ = full_doc
     return cls

+
 def new_data_check(self, X, y=None, weights=None):
     """
     Check new data for predicting, scoring or else.
diff --git a/python/pytest/test_alg.py b/python/pytest/test_alg.py
index a8a51cbc..6906dd5b 100644
--- a/python/pytest/test_alg.py
+++ b/python/pytest/test_alg.py
@@ -461,7 +461,7 @@ def test_PCA():
     assert_nan(model6.coef_)

     # ic
-    for ic in ['aic', 'bic', 'ebic', 'gic', 'hic']:
+    for ic in ['loss', 'aic', 'bic', 'ebic', 'gic', 'hic']:
         model = abess.SparsePCA(support_size=support_size, ic_type=ic)
         model.fit(X, is_normal=False)
@@ -533,7 +533,7 @@ def test_RPCA():
     # model3.fit(X, r=r)

     # ic
-    for ic in ['aic', 'bic', 'ebic', 'gic']:
+    for ic in ['aic', 'bic', 'ebic', 'gic', 'hic']:
         model4 = abess.RobustPCA(support_size=s, ic_type=ic)
         model4.fit(X, r=r)
diff --git a/python/pytest/test_check.py b/python/pytest/test_check.py
index 418e0758..cb63e895 100644
--- a/python/pytest/test_check.py
+++ b/python/pytest/test_check.py
@@ -45,6 +45,14 @@ def test_base():
     else:
         assert False

+    try:
+        model = abess.LinearRegression(cv=2, cv_score='other')
+        model.fit([[1], [2]], [1, 2])
+    except ValueError as e:
+        print(e)
+    else:
+        assert False
+
     # exchange_num
     try:
         model = abess.LinearRegression(exchange_num=-1)
@@ -487,6 +495,14 @@ def test_pca():
     else:
         assert False

+    try:
+        model1 = abess.SparsePCA(cv=2, cv_score='other')
+        model1.fit([[1], [2]])
+    except ValueError as e:
+        print(e)
+    else:
+        assert False
+
     try:
         model1 = abess.SparsePCA(cv=5)
         model1.fit([[1]])
diff --git a/python/pytest/test_flow.py b/python/pytest/test_flow.py
index ae0edd7a..585d0342 100644
--- a/python/pytest/test_flow.py
+++ b/python/pytest/test_flow.py
@@ -131,11 +131,15 @@ def test_possible_input():
     n = 100
     p = 20
     k = 5
+    M = 3
     # s_min = 0
     s_max = 10
     screen = 15
     imp = 5
     data = abess.make_glm_data(n=n, p=p, k=k, family='gaussian')
+    data2 = abess.make_glm_data(n=n, p=p, k=k, family='binomial')
+    data3 = abess.make_multivariate_glm_data(
+        n=n, p=p, k=k, M=M, family='multinomial')

     # alpha
     model = abess.LinearRegression(alpha=[0.1, 0.2, 0.3])
@@ -183,9 +187,15 @@ def test_possible_input():
     assert len(set(group[nonzero])) == 2

     # ic
-    for ic in ['aic', 'bic', 'ebic', 'gic', 'hic']:
+    for ic in ['loss', 'aic', 'bic', 'ebic', 'gic', 'hic']:
         model = abess.LinearRegression(ic_type=ic)
         model.fit(data.x, data.y)
+    for cv_score in ['test_loss', 'roc_auc']:
+        model = abess.LogisticRegression(cv_score=cv_score, cv=5)
+        model.fit(data2.x, data2.y)
+    for cv_score in ['test_loss', 'roc_auc_ovo', 'roc_auc_ovr']:
+        model = abess.MultinomialRegression(cv_score=cv_score, cv=5)
+        model.fit(data3.x, data3.y)

     # A_init
     model = abess.LinearRegression(A_init=[0, 1, 2])
diff --git a/python/setup.py b/python/setup.py
index d021b6fa..785f9850 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -166,6 +166,7 @@ def copy_src():

 # print("sys.platform output: {}".format(sys.platform))
 # print("platform.processor() output: {}".format(platform.processor()))

+
 need_clean_tree = copy_src()
 package_info = get_info()
diff --git a/python/src/pywrap.cpp b/python/src/pywrap.cpp
index 185cd4a5..2d8f2eec 100644
--- a/python/src/pywrap.cpp
+++ b/python/src/pywrap.cpp
@@ -8,8 +8,8 @@ std::tuple pywrap_GLM(
     Eigen::MatrixXd x_Mat, Eigen::MatrixXd y_Mat, Eigen::VectorXd weight_Vec, int n, int p, int normalize_type,
-    int algorithm_type, int model_type, int max_iter, int exchange_num, int path_type, bool is_warm_start, int ic_type,
-    double ic_coef, int Kfold, Eigen::VectorXi gindex_Vec, Eigen::VectorXi sequence_Vec,
+    int algorithm_type, int model_type, int max_iter, int exchange_num, int path_type, bool is_warm_start,
+    int eval_type, double ic_coef, int Kfold, Eigen::VectorXi gindex_Vec, Eigen::VectorXi sequence_Vec,
     Eigen::VectorXd lambda_sequence_Vec, Eigen::VectorXi cv_fold_id_Vec, int s_min, int s_max, double lambda_min,
     double lambda_max, int n_lambda, int screening_size, Eigen::VectorXi always_select_Vec,
     int primary_model_fit_max_iter, double primary_model_fit_epsilon, bool early_stop, bool approximate_Newton,
@@ -17,8 +17,8 @@ std::tuple pywrap_GLM(
     Eigen::VectorXi A_init_Vec) {
     List mylist =
         abessGLM_API(x_Mat, y_Mat, n, p, normalize_type, weight_Vec, algorithm_type, model_type, max_iter, exchange_num,
-                     path_type, is_warm_start, ic_type, ic_coef, Kfold, sequence_Vec, lambda_sequence_Vec, s_min, s_max,
-                     lambda_min, lambda_max, n_lambda, screening_size, gindex_Vec, always_select_Vec,
+                     path_type, is_warm_start, eval_type, ic_coef, Kfold, sequence_Vec, lambda_sequence_Vec, s_min,
+                     s_max, lambda_min, lambda_max, n_lambda, screening_size, gindex_Vec, always_select_Vec,
                      primary_model_fit_max_iter, primary_model_fit_epsilon, early_stop, approximate_Newton, thread,
                      covariance_update, sparse_matrix, splicing_type, sub_search, cv_fold_id_Vec, A_init_Vec);
@@ -60,12 +60,12 @@ std::tuple pywrap_GLM(
 std::tuple pywrap_PCA(
     Eigen::MatrixXd x_Mat, Eigen::VectorXd weight_Vec, int n, int p, int normalize_type, Eigen::MatrixXd sigma_Mat,
-    int max_iter, int exchange_num, int path_type, bool is_warm_start, int ic_type, double ic_coef, int Kfold,
+    int max_iter, int exchange_num, int path_type, bool is_warm_start, int eval_type, double ic_coef, int Kfold,
     Eigen::VectorXi gindex_Vec, Eigen::MatrixXi sequence_Mat, Eigen::VectorXi cv_fold_id_Vec, int s_min, int s_max,
     int screening_size, Eigen::VectorXi always_select_Vec, bool early_stop, int thread, bool sparse_matrix,
     int splicing_type, int sub_search, int pca_num, Eigen::VectorXi A_init_Vec) {
     List mylist = abessPCA_API(x_Mat, n, p, normalize_type, weight_Vec, sigma_Mat, max_iter, exchange_num, path_type,
-                               is_warm_start, ic_type, ic_coef, Kfold, sequence_Mat, s_min, s_max, screening_size,
+                               is_warm_start, eval_type, ic_coef, Kfold, sequence_Mat, s_min, s_max, screening_size,
                                gindex_Vec, always_select_Vec, early_stop, thread, sparse_matrix, splicing_type,
                                sub_search, cv_fold_id_Vec, pca_num, A_init_Vec);
@@ -94,13 +94,13 @@ std::tuple pywrap_PCA(
 std::tuple pywrap_RPCA(
     Eigen::MatrixXd x_Mat, int n, int p, int normalize_type, int max_iter, int exchange_num, int path_type,
-    bool is_warm_start, int ic_type, double ic_coef, Eigen::VectorXi gindex_Vec, Eigen::VectorXi sequence_Vec,
+    bool is_warm_start, int eval_type, double ic_coef, Eigen::VectorXi gindex_Vec, Eigen::VectorXi sequence_Vec,
     Eigen::VectorXd lambda_sequence_Vec, int s_min, int s_max, double lambda_min, double lambda_max, int n_lambda,
     int screening_size, Eigen::VectorXi always_select_Vec, int primary_model_fit_max_iter,
     double primary_model_fit_epsilon, bool early_stop, int thread, bool sparse_matrix, int splicing_type,
     int sub_search, Eigen::VectorXi A_init_Vec) {
     List mylist =
-        abessRPCA_API(x_Mat, n, p, max_iter, exchange_num, path_type, is_warm_start, ic_type, ic_coef, sequence_Vec,
+        abessRPCA_API(x_Mat, n, p, max_iter, exchange_num, path_type, is_warm_start, eval_type, ic_coef, sequence_Vec,
                       lambda_sequence_Vec, s_min, s_max, lambda_min, lambda_max, n_lambda, screening_size,
                       primary_model_fit_max_iter, primary_model_fit_epsilon, gindex_Vec, always_select_Vec, early_stop,
                       thread, sparse_matrix, splicing_type, sub_search, A_init_Vec);
diff --git a/src/Metric.h b/src/Metric.h
index a38df80c..a1933f2f 100644
--- a/src/Metric.h
+++ b/src/Metric.h
@@ -19,12 +19,16 @@ class Metric {
   public:
    bool is_cv;
    int Kfold;
-    int ic_type;
+    int eval_type;
+    double ic_coef;
+
+    bool raise_warning = true;
+
    // Eigen::Matrix cv_initial_model_param;
    // Eigen::Matrix cv_initial_coef0;

-    std::vector cv_initial_A;
-    std::vector cv_initial_I;
+    // std::vector cv_initial_A;
+    // std::vector cv_initial_I;

    std::vector train_mask_list;
    std::vector test_mask_list;
@@ -40,13 +44,11 @@ class Metric {

    // std::vector> group_XTX_list;

-    double ic_coef;
-
    Metric() = default;

-    Metric(int ic_type, double ic_coef = 1.0, int Kfold = 5) {
+    Metric(int eval_type, double ic_coef = 1.0, int Kfold = 5) {
        this->is_cv = Kfold > 1;
-        this->ic_type = ic_type;
+        this->eval_type = eval_type;
        this->Kfold = Kfold;
        this->ic_coef = ic_coef;
        if (is_cv) {
@@ -211,6 +213,7 @@ class Metric {
    // }

    double ic(int train_n, int M, int N, Algorithm<T1, T2, T3, T4> *algorithm) {
+        // information criterion: for non-CV
        double loss;
        if (algorithm->model_type == 1 || algorithm->model_type == 5) {
            loss = train_n * (algorithm->get_train_loss() - algorithm->lambda_level * algorithm->beta.cwiseAbs2().sum());
        } else {
            loss = 2 * (algorithm->get_train_loss() - algorithm->lambda_level * algorithm->beta.cwiseAbs2().sum());
        }
-
-        if (ic_type == 0) {
+        // 0. only loss
+        if (this->eval_type == 0) {
            return loss;
-        } else if (ic_type == 1) {
+        }
+        // 1. AIC
+        if (this->eval_type == 1) {
            return loss + 2.0 * algorithm->get_effective_number();
-        } else if (ic_type == 2) {
+        }
+        // 2. BIC
+        if (this->eval_type == 2) {
            return loss + this->ic_coef * log(double(train_n)) * algorithm->get_effective_number();
-        } else if (ic_type == 3) {
+        }
+        // 3. GIC
+        if (this->eval_type == 3) {
            return loss + this->ic_coef * log(double(N)) * log(log(double(train_n))) * algorithm->get_effective_number();
-        } else if (ic_type == 4) {
+        }
+        // 4. EBIC
+        if (this->eval_type == 4) {
            return loss + this->ic_coef * (log(double(train_n)) + 2 * log(double(N))) * algorithm->get_effective_number();
-        } else if (ic_type == 5) {
+        }
+        // 5. HIC
+        if (this->eval_type == 5) {
            return train_n * (algorithm->get_train_loss() - algorithm->lambda_level * algorithm->beta.cwiseAbs2().sum()) +
                   this->ic_coef * log(double(N)) * log(log(double(train_n))) * algorithm->get_effective_number();
-        } else
-            return 0;
+        }
+        if (this->raise_warning) {
+            cout << "[warning] No available IC type for training. Use loss instead. "
+                 << "(E" << this->eval_type << "M" << algorithm->model_type << ")" << endl;
+            this->raise_warning = false;
+        }
+        // return 0;
+        return loss;
    };
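Every branch above is the (scaled) training loss plus a penalty proportional to the effective number of parameters. The same formulas restated compactly, for checking values by hand (a sketch only; `loss`, `df`, `n` and `N` mirror the C++ `loss`, `get_effective_number()`, `train_n` and `N`, and HIC is shown with its penalty term only, since its loss term is additionally rescaled by `train_n` above):

    import math

    def information_criterion(eval_type, loss, df, n, N, ic_coef=1.0):
        # eval_type: 0 = loss, 1 = AIC, 2 = BIC, 3 = GIC, 4 = EBIC, 5 = HIC
        penalty = {
            0: 0.0,
            1: 2.0 * df,                                            # AIC (no ic_coef)
            2: ic_coef * math.log(n) * df,                          # BIC
            3: ic_coef * math.log(N) * math.log(math.log(n)) * df,  # GIC
            4: ic_coef * (math.log(n) + 2 * math.log(N)) * df,      # EBIC
            5: ic_coef * math.log(N) * math.log(math.log(n)) * df,  # HIC penalty
        }[eval_type]
        return loss + penalty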

-    double loss_function(T4 &train_x, T1 &train_y, Eigen::VectorXd &train_weight, Eigen::VectorXi &g_index,
-                         Eigen::VectorXi &g_size, int train_n, int p, int N, Algorithm<T1, T2, T3, T4> *algorithm) {
+    double test_loss(T4 &test_x, T1 &test_y, Eigen::VectorXd &test_weight, Eigen::VectorXi &g_index,
+                     Eigen::VectorXi &g_size, int test_n, int p, int N, Algorithm<T1, T2, T3, T4> *algorithm) {
+        // test loss: for CV
        Eigen::VectorXi A = algorithm->get_A_out();
        T2 beta = algorithm->get_beta();
        T3 coef0 = algorithm->get_coef0();

        Eigen::VectorXi A_ind = find_ind(A, g_index, g_size, beta.rows(), N);
-        T4 X_A = X_seg(train_x, train_n, A_ind, algorithm->model_type);
+        T4 test_X_A = X_seg(test_x, test_n, A_ind, algorithm->model_type);

        T2 beta_A;
        slice(beta, A_ind, beta_A);

-        // Eigen::VectorXd beta_A(A_ind.size());
-        // for (int k = 0; k < A_ind.size(); k++)
-        // {
-        //     beta_A(k) = beta(A_ind(k));
-        // }
-        return algorithm->loss_function(X_A, train_y, train_weight, beta_A, coef0, A, g_index, g_size, 0.0);
-    }
+        // 0. only test loss
+        if (this->eval_type == 0) {
+            return algorithm->loss_function(test_X_A, test_y, test_weight, beta_A, coef0, A, g_index, g_size,
+                                            algorithm->lambda_level);
+        }
+        // 1. negative AUC (for logistic)
+        if (this->eval_type == 1 && algorithm->model_type == 2) {
+            // compute probability
+            Eigen::VectorXd test_y_temp = test_y;
+            Eigen::VectorXd proba = test_X_A * beta_A + coef0 * Eigen::VectorXd::Ones(test_n);
+            proba = proba.array().exp();
+            proba = proba.cwiseQuotient(Eigen::VectorXd::Ones(test_n) + proba);
+            return -this->binary_auc_score(test_y_temp, proba);
+        }
+        // 2. 3. negative AUC, One vs One/Rest (for multinomial)
+        if (algorithm->model_type == 6) {
+            int M = test_y.cols();
+            // compute probability
+            Eigen::MatrixXd proba = test_X_A * beta_A;
+            proba = rowwise_add(proba, coef0);
+            proba = proba.array().exp();
+            Eigen::VectorXd proba_rowsum = proba.rowwise().sum();
+            proba = proba.cwiseQuotient(proba_rowsum.replicate(1, M));
+            // compute AUC
+            if (this->eval_type == 2) {
+                // (One vs One) the AUC of all possible pairwise combinations of classes
+                double auc = 0;
+                for (int i = 0; i < M - 1; i++) {
+                    for (int j = i + 1; j < M; j++) {
+                        int nij = 0;
+                        Eigen::VectorXd test_y_i(test_n), test_y_j(test_n), proba_i(test_n), proba_j(test_n);
+                        // extract samples that belong to class i or j
+                        for (int k = 0; k < test_n; k++) {
+                            if (test_y(k, i) + test_y(k, j) > 0) {
+                                test_y_i(nij) = test_y(k, i);
+                                test_y_j(nij) = test_y(k, j);
+                                proba_i(nij) = proba(k, i);
+                                proba_j(nij) = proba(k, j);
+                                nij++;
+                            }
+                        }
+                        test_y_i = test_y_i.head(nij).eval();
+                        test_y_j = test_y_j.head(nij).eval();
+                        proba_i = proba_i.head(nij).eval();
+                        proba_j = proba_j.head(nij).eval();
+                        // get AUC
+                        auc += this->binary_auc_score(test_y_i, proba_i);
+                        auc += this->binary_auc_score(test_y_j, proba_j);
+                    }
+                }
+                return -auc / (M * (M - 1));
+            }
+            if (this->eval_type == 3) {
+                // (One vs Rest) the AUC of each class against the rest
+                double auc = 0;
+                for (int i = 0; i < M; i++) {
+                    Eigen::VectorXd test_y_single = test_y.col(i);
+                    Eigen::VectorXd proba_single = proba.col(i);
+                    auc += this->binary_auc_score(test_y_single, proba_single);
+                }
+                return -auc / M;
+            }
+        }
+        if (this->raise_warning) {
+            cout << "[warning] No available CV score for training. Use test_loss instead. "
+                 << "(E" << this->eval_type << "M" << algorithm->model_type << ")" << endl;
+            this->raise_warning = false;
+        }
+        // return 0;
+        return algorithm->loss_function(test_X_A, test_y, test_weight, beta_A, coef0, A, g_index, g_size,
+                                        algorithm->lambda_level);
+    };
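Because the tuning framework minimizes its criterion, the AUCs above are returned negated. The One-vs-One average runs over the M(M-1) ordered class pairs and the One-vs-Rest average over the M classes, which appears to follow scikit-learn's macro-averaging conventions; a quick consistency check from the Python side (a sketch assuming scikit-learn is installed; the random data stands in for `proba` and `test_y`):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    rng = np.random.default_rng(0)
    n, M = 200, 3
    proba = rng.dirichlet(np.ones(M), size=n)  # rows sum to one, like `proba` above
    labels = rng.integers(0, M, size=n)        # class index per sample

    auc_ovo = roc_auc_score(labels, proba, multi_class="ovo")  # cf. eval_type == 2
    auc_ovr = roc_auc_score(labels, proba, multi_class="ovr")  # cf. eval_type == 3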
+    double binary_auc_score(Eigen::VectorXd &true_label, Eigen::VectorXd &pred_proba) {
+        // sort proba from large to small
+        int n = true_label.rows();
+        Eigen::VectorXi sort_ind = max_k(pred_proba, n, true);
+
+        // use each value as threshold to get TPR, FPR
+        double tp = 0, fp = 0, positive = true_label.sum();
+        double last_tpr = 0, last_fpr = 0, auc = 0;
+
+        if (positive == 0 || positive == n) {
+            cout << "[Warning] There is only one class in the test data, "
+                 << "the result may be meaningless. Please use another type of loss, "
+                 << "or try to specify cv_fold_id." << endl;
+        }
+
+        for (int i = 0; i < n; i++) {
+            // current threshold: pred_proba(sort_ind(i))
+            int k = sort_ind(i);
+            tp += true_label(k);
+            fp += 1 - true_label(k);
+            // skip same threshold
+            if (i < n - 1) {
+                int kk = sort_ind(i + 1);
+                if (pred_proba(k) == pred_proba(kk)) continue;
+            }
+            // compute tpr, fpr
+            double tpr = tp / positive;
+            double fpr = fp / (n - positive);
+            if (fpr > last_fpr) {
+                auc += (tpr + last_tpr) / 2 * (fpr - last_fpr);
+            }
+            last_tpr = tpr;
+            last_fpr = fpr;
+        }
+        return auc;
+    };
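The single-class warning above can be avoided from the Python side by assigning folds explicitly, so that every fold keeps both classes. A sketch using scikit-learn's StratifiedKFold together with the existing `cv_fold_id` argument of `fit` (it assumes fold labels only need to be distinct per fold):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    import abess

    data = abess.make_glm_data(n=100, p=20, k=5, family="binomial")

    cv_fold_id = np.zeros(100, dtype="int32")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fold, (_, test_idx) in enumerate(skf.split(data.x, data.y)):
        cv_fold_id[test_idx] = fold  # every fold gets a class-balanced test set

    model = abess.LogisticRegression(cv=5, cv_score="roc_auc")
    model.fit(data.x, data.y, cv_fold_id=cv_fold_id)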

    // to do
    Eigen::VectorXd fit_and_evaluate_in_metric(std::vector<Algorithm<T1, T2, T3, T4> *> algorithm_list,
@@ -328,9 +450,8 @@ class Metric {
                // this->update_cv_initial_coef0(algorithm->get_coef0(), k);
            }

-            loss_list(k) =
-                this->loss_function(this->test_X_list[k], this->test_y_list[k], this->test_weight_list[k], g_index,
-                                    g_size, test_n, p, N, algorithm_list[k]);
+            loss_list(k) = this->test_loss(this->test_X_list[k], this->test_y_list[k], this->test_weight_list[k],
+                                           g_index, g_size, test_n, p, N, algorithm_list[k]);
        }
    }
diff --git a/src/api.h b/src/api.h
index 981623fe..2f0c9b5a 100644
--- a/src/api.h
+++ b/src/api.h
@@ -81,8 +81,7 @@ using namespace Rcpp;
 * s_max), where the specific support size to be considered is determined by golden section.
 * @param is_warm_start When tuning the optimal parameter combination, whether to use the last solution
 * as a warm start to accelerate the iterative convergence of the splicing algorithm.
- * @param ic_type The type of criterion for choosing the support size. Available options are
- * "gic", "ebic", "bic", "aic".
+ * @param ic_type The type of criterion for choosing the support size.
 * @param Kfold The folds number to use the Cross-validation method. If Kfold=1,
 * Cross-validation will not be used.
 * @param sequence An integer vector representing the alternative support sizes. Only used for
diff --git a/src/normalize.cpp b/src/normalize.cpp
index 98645675..edce5078 100644
--- a/src/normalize.cpp
+++ b/src/normalize.cpp
@@ -14,8 +14,7 @@
 using namespace Rcpp;
 using namespace std;

-void constant_warning_ith_variable(int i)
-{
+void constant_warning_ith_variable(int i) {
 #ifdef R_BUILD
    Rcout << "Warning: the variable " << i + 1 << " is constant. ";
    Rcout << "It may cause NAN in the result. Please drop this variable or disable the normalization.\n";
diff --git a/src/path.h b/src/path.h
index 79bd8feb..2d36c4a0 100644
--- a/src/path.h
+++ b/src/path.h
@@ -94,7 +94,7 @@ void sequential_path_cv(Data &data, Algorithm *a
            // evaluate the beta
            if (metric->is_cv) {
                test_loss_matrix(ind) =
-                    metric->loss_function(test_x, test_y, test_weight, g_index, g_size, test_n, p, N, algorithm);
+                    metric->test_loss(test_x, test_y, test_weight, g_index, g_size, test_n, p, N, algorithm);
            } else {
                ic_matrix(ind) = metric->ic(train_n, M, N, algorithm);
            }
diff --git a/src/utilities.h b/src/utilities.h
index 61529352..6b946ea4 100644
--- a/src/utilities.h
+++ b/src/utilities.h
@@ -124,15 +124,15 @@ class Parameters {
        }
    }

-    // void print_sequence() {
-    //     // for debug
-    //     std::cout << "==> Parameter List:" << endl;
-    //     for (int i = 0; i < (this->sequence).size(); i++) {
-    //         int support_size = (this->sequence(i)).support_size;
-    //         double lambda = (this->sequence(i)).lambda;
-    //         std::cout << "    support_size = " << support_size << ", lambda = " << lambda << endl;
-    //     }
-    // }
+    void print_sequence() {
+        // for debug
+        std::cout << "==> Parameter List:" << endl;
+        for (int i = 0; i < (this->sequence).size(); i++) {
+            int support_size = (this->sequence(i)).support_size;
+            double lambda = (this->sequence(i)).lambda;
+            std::cout << "    support_size = " << support_size << ", lambda = " << lambda << endl;
+        }
+    }
 };

 /**
@@ -497,4 +497,11 @@ void trunc(Eigen::VectorXd &vec, double *trunc_range) {
    }
 }

+Eigen::MatrixXd rowwise_add(Eigen::MatrixXd &m, Eigen::VectorXd &v) { return m.rowwise() + v.transpose(); }
+
+Eigen::MatrixXd rowwise_add(Eigen::MatrixXd &m, double &v) {
+    Eigen::VectorXd ones = Eigen::VectorXd::Ones(m.cols());
+    return m.rowwise() + ones.transpose() * v;
+}
+
 #endif  // SRC_UTILITIES_H
diff --git a/src/workflow.h b/src/workflow.h
index 0a0fcd3d..10a27c73 100644
--- a/src/workflow.h
+++ b/src/workflow.h
@@ -59,7 +59,7 @@ using namespace std;
 * @param algorithm_type type of algorithm
 * @param path_type type of path: 1 for sequential search and 2 for golden section search
 * @param is_warm_start whether to enable warm-start
- * @param ic_type type of information criterion, used for not CV
+ * @param eval_type type of evaluation: an information criterion (Kfold = 1) or a CV score (Kfold > 1)
 * @param Kfold number of folds, used for CV
 * @param parameters parameters to be selected, including `support_size`, `lambda`
 * @param screening_size size of screening
@@ -74,7 +74,7 @@ using namespace std;
 */
 template <class T1, class T2, class T3, class T4>
 List abessWorkflow(T4 &x, T1 &y, int n, int p, int normalize_type, Eigen::VectorXd weight, int algorithm_type,
-                   int path_type, bool is_warm_start, int ic_type, double ic_coef, int Kfold, Parameters parameters,
+                   int path_type, bool is_warm_start, int eval_type, double ic_coef, int Kfold, Parameters parameters,
                    int screening_size, Eigen::VectorXi g_index, bool early_stop, int thread, bool sparse_matrix,
                    Eigen::VectorXi &cv_fold_id, Eigen::VectorXi &A_init,
                    vector<Algorithm<T1, T2, T3, T4> *> algorithm_list) {
@@ -111,7 +111,7 @@ List abessWorkflow(T4 &x, T1 &y, int n, int p, int normalize_type, Eigen::Vector
    // if CV is enabled,
    // specify train and test data,
    // and initialize the fitting argument inside each fold.
-    Metric<T1, T2, T3, T4> *metric = new Metric<T1, T2, T3, T4>(ic_type, ic_coef, Kfold);
+    Metric<T1, T2, T3, T4> *metric = new Metric<T1, T2, T3, T4>(eval_type, ic_coef, Kfold);
    if (Kfold > 1) {
        metric->set_cv_train_test_mask(data, data.n, cv_fold_id);
        metric->set_cv_init_fit_arg(beta_size, data.M);
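For intuition, the two `rowwise_add` overloads added to src/utilities.h broadcast a per-class intercept vector (or a scalar) across the rows of a score matrix, as used when adding `coef0` to the multinomial scores in src/Metric.h. The NumPy equivalent, as a sketch:

    import numpy as np

    m = np.arange(6.0).reshape(3, 2)  # n x M score matrix
    v = np.array([10.0, 20.0])        # per-class intercept, like coef0

    out_vec = m + v       # rowwise_add(m, v): v is added to every row
    out_scalar = m + 0.5  # rowwise_add(m, 0.5): a scalar is added everywhere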