Merge pull request #477 from oooo26/master

Support AUC on CV for classification problem
abess-team · Jan 31, 2023 · 7d2bd24 · 7d2bd24
2 parents 994fe73 + c903187
commit 7d2bd24
Show file tree

Hide file tree

Showing 18 changed files with 352 additions and 135 deletions.
diff --git a/R-package/R/abesspca.R b/R-package/R/abesspca.R
@@ -235,7 +235,7 @@ abesspca <- function(x,
     exchange_num = c_max,
     path_type = path_type,
     is_warm_start = warm.start,
-    ic_type = 1,
+    ic_type = ic_type,
     ic_coef = ic_scale,
     Kfold = nfolds,
     sequence = s_list_bool,

diff --git a/R-package/R/utility.R b/R-package/R/utility.R
@@ -82,7 +82,7 @@ map_tunetype2numeric <- function(tune.type) {
     "bic" = 2,
     "gic" = 3,
     "ebic" = 4,
-    "cv" = 1
+    "cv" = 0
   )
   ic_type
 }

diff --git a/python/abess/bess_base.py b/python/abess/bess_base.py
@@ -40,7 +40,7 @@ class bess_base(BaseEstimator):
 
         - If alpha = 0, it indicates ordinary least square.
 
-    ic_type : {'aic', 'bic', 'gic', 'ebic'}, optional, default='ebic'
+    ic_type : {'aic', 'bic', 'gic', 'ebic', 'loss'}, optional, default='ebic'
         The type of criterion for choosing the support size if `cv=1`.
     ic_coef : float, optional, default=1.0
         Constant that controls the regularization strength
@@ -52,6 +52,14 @@ class bess_base(BaseEstimator):
         - If cv>1, support size will be chosen by CV's test loss,
           instead of IC.
 
+    cv_score : {'test_loss', ...}, optional, default='test_loss'
+        The score used on test data for CV (only used if `cv>1`).
+
+        - All methods support {'test_loss'}.
+        - LogisticRegression also supports {'roc_auc'}.
+        - MultinomialRegression also supports {'roc_auc_ovo', 'roc_auc_ovr'},
+          i.e. the "One vs One" and "One vs Rest" schemes, respectively.
+
     thread : int, optional, default=1
         Max number of multithreads.
 
@@ -131,6 +139,7 @@ def __init__(
         ic_type="ebic",
         ic_coef=1.0,
         cv=1,
+        cv_score="test_loss",
         thread=1,
         A_init=None,
         always_select=None,
@@ -170,6 +179,7 @@ def __init__(
         self.ic_type = ic_type
         self.ic_coef = ic_coef
         self.cv = cv
+        self.cv_score = cv_score
         self.screening_size = screening_size
         self.always_select = always_select
         self.primary_model_fit_max_iter = primary_model_fit_max_iter
@@ -323,28 +333,49 @@ def fit(self,
         else:
             raise ValueError("path_type should be \'seq\' or \'gs\'")
 
-        # Ic_type: aic, bic, gic, ebic
-        if self.ic_type == "aic":
-            ic_type_int = 1
-        elif self.ic_type == "bic":
-            ic_type_int = 2
-        elif self.ic_type == "gic":
-            ic_type_int = 3
-        elif self.ic_type == "ebic":
-            ic_type_int = 4
-        elif self.ic_type == "hic":
-            ic_type_int = 5
-        else:
-            raise ValueError(
-                "ic_type should be \"aic\", \"bic\", \"ebic\","
-                " \"gic\" or \"hic\".")
-
         # cv
         if (not isinstance(self.cv, int) or self.cv <= 0):
             raise ValueError("cv should be an positive integer.")
         if self.cv > n:
             raise ValueError("cv should be smaller than n.")
 
+        # ic_type (cv == 1): loss, aic, bic, gic, ebic, hic
+        # cv_score (cv > 1): test_loss, roc_auc, roc_auc_ovo, roc_auc_ovr
+        if self.cv == 1:
+            if self.ic_type == "loss":
+                eval_type_int = 0
+            elif self.ic_type == "aic":
+                eval_type_int = 1
+            elif self.ic_type == "bic":
+                eval_type_int = 2
+            elif self.ic_type == "gic":
+                eval_type_int = 3
+            elif self.ic_type == "ebic":
+                eval_type_int = 4
+            elif self.ic_type == "hic":
+                eval_type_int = 5
+            else:
+                raise ValueError(
+                    "ic_type should be \"aic\", \"bic\", \"ebic\","
+                    " \"gic\" or \"hic\".")
+        else:
+            if self.cv_score == "test_loss":
+                eval_type_int = 0
+            elif self.cv_score == "roc_auc" and self.model_type == "Logistic":
+                eval_type_int = 1
+            elif (self.cv_score == "roc_auc_ovo" and
+                  self.model_type == "Multinomial"):
+                eval_type_int = 2
+            elif (self.cv_score == "roc_auc_ovr" and
+                  self.model_type == "Multinomial"):
+                eval_type_int = 3
+            else:
+                raise ValueError(
+                    "cv_score should be \"test_loss\", "
+                    "\"roc_auc\"(for logistic), "
+                    "\"roc_auc_ovo\"(for multinomial), or "
+                    "\"roc_auc_ovr\"(for multinomial).")
+
         # cv_fold_id
         if cv_fold_id is None:
             cv_fold_id = np.array([], dtype="int32")
@@ -561,7 +592,7 @@ def fit(self,
                 X, y, sample_weight, n, p, normalize, algorithm_type_int,
                 model_type_int,
                 self.max_iter, self.exchange_num, path_type_int,
-                self.is_warm_start, ic_type_int, self.ic_coef, self.cv,
+                self.is_warm_start, eval_type_int, self.ic_coef, self.cv,
                 g_index,
                 support_sizes, alphas, cv_fold_id, new_s_min, new_s_max,
                 new_lambda_min, new_lambda_max, n_lambda, self.screening_size,

diff --git a/python/abess/decomposition.py b/python/abess/decomposition.py
@@ -41,6 +41,10 @@ class SparsePCA(bess_base):
         - If cv>1, support size will be chosen by CV's test loss,
           instead of IC.
 
+    cv_score : {'test_loss'}, optional, default='test_loss'
+        The score used on test data for CV.
+        Only 'test_loss' is supported for PCA now.
+
     thread : int, optional, default=1
         Max number of multithreads.
 
@@ -125,8 +129,9 @@ class SparsePCA(bess_base):
     """
 
     def __init__(self, support_size=None, group=None,
-                 ic_type="loss", ic_coef=1.0, cv=1, thread=1,
-                 A_init=None, always_select=None,
+                 ic_type="loss", ic_coef=1.0,
+                 cv=1, cv_score="test_loss",
+                 thread=1, A_init=None, always_select=None,
                  max_iter=20, exchange_num=5, is_warm_start=True,
                  splicing_type=1,
                  screening_size=-1,
@@ -137,7 +142,7 @@ def __init__(self, support_size=None, group=None,
             max_iter=max_iter, exchange_num=exchange_num,
             is_warm_start=is_warm_start, support_size=support_size,
             # s_min=s_min, s_max=s_max,
-            ic_type=ic_type, ic_coef=ic_coef, cv=cv,
+            ic_type=ic_type, ic_coef=ic_coef, cv=cv, cv_score=cv_score,
             screening_size=screening_size,
             always_select=always_select,
             thread=thread,
@@ -264,23 +269,31 @@ def fit(self, X=None, y=None, is_normal=False,
         # model_type_int = 7
         path_type_int = 1
 
-        # Ic_type
-        if self.ic_type == "loss":
-            ic_type_int = 0
-        elif self.ic_type == "aic":
-            ic_type_int = 1
-        elif self.ic_type == "bic":
-            ic_type_int = 2
-        elif self.ic_type == "gic":
-            ic_type_int = 3
-        elif self.ic_type == "ebic":
-            ic_type_int = 4
-        elif self.ic_type == "hic":
-            ic_type_int = 5
+        # ic_type (cv == 1): loss, aic, bic, gic, ebic, hic
+        # cv_score (cv != 1): test_loss only
+        if self.cv == 1:
+            if self.ic_type == "loss":
+                eval_type_int = 0
+            elif self.ic_type == "aic":
+                eval_type_int = 1
+            elif self.ic_type == "bic":
+                eval_type_int = 2
+            elif self.ic_type == "gic":
+                eval_type_int = 3
+            elif self.ic_type == "ebic":
+                eval_type_int = 4
+            elif self.ic_type == "hic":
+                eval_type_int = 5
+            else:
+                raise ValueError(
+                    "ic_type should be \"aic\", \"bic\", \"ebic\","
+                    " \"gic\" or \"hic\".")
         else:
-            raise ValueError(
-                "ic_type should be \"loss\", \"aic\", \"bic\","
-                " \"ebic\", \"gic\" or \"hic\".")
+            if self.cv_score == "test_loss":
+                eval_type_int = 0
+            else:
+                raise ValueError(
+                    "cv_score should be \"test_loss\".")
 
         # cv
         if (not isinstance(self.cv, int) or self.cv <= 0):
@@ -425,7 +438,7 @@ def fit(self, X=None, y=None, is_normal=False,
                 n, p, normalize, Sigma,
                 self.max_iter, self.exchange_num,
                 path_type_int, self.is_warm_start,
-                ic_type_int, self.ic_coef, self.cv,
+                eval_type_int, self.ic_coef, self.cv,
                 g_index,
                 support_sizes,
                 cv_fold_id,
@@ -633,15 +646,15 @@ def fit(self, X, y=None, r=None, sparse_matrix=False):
 
         # Ic_type
         if self.ic_type == "aic":
-            ic_type_int = 1
+            eval_type_int = 1
         elif self.ic_type == "bic":
-            ic_type_int = 2
+            eval_type_int = 2
         elif self.ic_type == "gic":
-            ic_type_int = 3
+            eval_type_int = 3
         elif self.ic_type == "ebic":
-            ic_type_int = 4
+            eval_type_int = 4
         elif self.ic_type == "hic":
-            ic_type_int = 5
+            eval_type_int = 5
         else:
             raise ValueError(
                 "ic_type should be \"aic\", \"bic\", \"ebic\", \"gic\", "
@@ -769,7 +782,7 @@ def fit(self, X, y=None, r=None, sparse_matrix=False):
                 X, n, p, normalize,
                 self.max_iter, self.exchange_num,
                 path_type_int, self.is_warm_start,
-                ic_type_int, self.ic_coef,
+                eval_type_int, self.ic_coef,
                 g_index,
                 support_sizes,
                 alphas,