Merge branch 'master' of https://github.com/statistikat/VIM

statistikat · Nov 10, 2023 · c6214f0 · c6214f0
2 parents 2f57943 + 0542de0
commit c6214f0
Show file tree

Hide file tree

Showing 21 changed files with 351 additions and 19 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -29,9 +29,9 @@ jobs:
           - {os: windows-latest, r: '3.6'}
 
           # Use older ubuntu to maximise backward compatibility
-          - {os: ubuntu-18.04,   r: 'devel', http-user-agent: 'release'}
-          - {os: ubuntu-18.04,   r: 'release'}
-          - {os: ubuntu-18.04,   r: 'oldrel-1'}
+          - {os: ubuntu-20.04,   r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-20.04,   r: 'release'}
+          - {os: ubuntu-20.04,   r: 'oldrel-1'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,6 +26,7 @@ Imports:
     laeken,
     ranger,
     MASS,
+    xgboost,
     data.table(>= 1.9.4)
 Suggests: 
     dplyr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -50,6 +50,7 @@ export(scattMiss)
 export(scattmatrixMiss)
 export(spineMiss)
 export(tableMiss)
+export(xgboostImpute)
 import(Rcpp)
 import(colorspace)
 import(data.table)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# VIM 6.x.x
+ - fix infinite loop in matchImpute in case all observations of a variable are missing
+
 # VIM 6.2.3
 - default robust regression method for irmi for numeric variables changes from rlm to lmrob.
 

diff --git a/R/gowerD.R b/R/gowerD.R
@@ -47,6 +47,19 @@ gowerD <- function(data.x, data.y = data.x,
                    returnMin=FALSE,
                    methodStand="range") {
   stopifnot(length(methodStand)==1&&methodStand%in%c("range", "iqr"))
+  if(length(levOrders)>0){
+    stopifnot(length(levOrders)==length(orders))
+    levOrdersUniqueX <- sapply(orders,function(x)length(unique(data.x[[x]])))
+    if(any(levOrdersUniqueX!=levOrders)){
+      warning("The number of unique values in the ordinal variables in data.x
+              does not correspond to the values given in levOrders")
+    }
+    levOrdersUniqueY <- sapply(orders,function(x)length(unique(data.x[[x]])))
+    if(any(levOrdersUniqueY!=levOrders)){
+      warning("The number of unique values in the ordinal variables in data.y
+              does not correspond to the values given in levOrders")
+    }
+  }
   maxplus1 <- function(x){
     if(all(is.na(x)))
       return(1)
@@ -86,6 +99,7 @@ gowerD <- function(data.x, data.y = data.x,
                         2,min0,na.rm=TRUE)
       rmax <- apply(rbind(data.x[,numerical,drop=FALSE],
                         data.y[,numerical,drop=FALSE]),2,max1,na.rm=TRUE)
+      print(c(rmin,rmax))
     }else if(methodStand == "iqr"){
       rmin <- apply(rbind(data.x[,numerical,drop=FALSE],data.y[,numerical,drop=FALSE]),2,quantile,na.rm=TRUE,
                     probs=.25)

diff --git a/R/matchImpute.R b/R/matchImpute.R
@@ -59,8 +59,14 @@ matchImpute <- function(data,variable=colnames(data)[!colnames(data)%in%match_va
     data <- as.data.table(data)
   else
     data <- data.table::copy(data)
+  tfna <- data[,sapply(lapply(.SD,is.na),all),.SDcols=variable]
+  if(any(tfna)){
+    stop(paste0(variable[tfna],collapse=", ")," ", ifelse(sum(tfna)>1,"are","is")," completely missing")
+
+  }
   na_present <- data[,sum(sapply(lapply(.SD,is.na),sum)),.SDcols=variable]
 
+
   if(imp_var){
     data[,paste(variable,imp_suffix,sep="_"):=lapply(.SD,is.na),.SDcols=variable]
   }

diff --git a/R/rangerImpute.R b/R/rangerImpute.R
@@ -31,22 +31,44 @@ rangerImpute <- function(formula, data, imp_var = TRUE,
   for (lhsV in lhs) {
     form <- as.formula(paste(lhsV, "~", rhs))
     lhs_vector <- data[[lhsV]]
+    lhs_isfactor <- inherits(lhs_vector, "factor")
+
     if (!any(is.na(lhs_vector))) {
       cat(paste0("No missings in ", lhsV, ".\n"))
     } else {
       lhs_na <- is.na(lhs_vector)
       if (verbose)
         message("Training   model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations")
-      mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), ..., verbose = verbose)
+
+      if(lhs_isfactor){
+        mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), probability = TRUE, ..., verbose = verbose)
+      }else{
+        mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), ..., verbose = verbose)
+      }
+
       if (verbose)
         message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")
-      if (median & inherits(lhs_vector, "numeric")) {
-        predictions <- apply(
-          predict(mod, subset(data, !rhs_na & lhs_na), predict.all = TRUE)$predictions,
-          1, median)
-      } else {
+
+      if(lhs_isfactor){
         predictions <- predict(mod, subset(data, !rhs_na & lhs_na))$predictions
+        predict_levels <- colnames(predictions)
+
+        predictions <- apply(predictions,1,function(z,lev){
+          z <- cumsum(z)
+          z_lev <- lev[z>runif(1)]
+          return(z_lev[1])
+        },lev=predict_levels)
+
+      }else{
+        if (median & inherits(lhs_vector, "numeric")) {
+          predictions <- apply(
+            predict(mod, subset(data, !rhs_na & lhs_na), predict.all = TRUE)$predictions,
+            1, median)
+        } else {
+          predictions <- predict(mod, subset(data, !rhs_na & lhs_na))$predictions
+        }
       }
+
       data[!rhs_na & lhs_na, lhsV] <- predictions
     }
 

diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R
@@ -0,0 +1,132 @@
+#' Xgboost Imputation
+#'
+#' Impute missing values based on a random forest model using [xgboost::xgboost()]
+#' @param formula model formula for the imputation
+#' @param data A `data.frame` containing the data
+#' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed
+#' variable should be created show the imputation status
+#' @param imp_suffix suffix used for TF imputation variables
+#' @param verbose Show the number of observations used for training
+#'   and evaluating the RF-Model. This parameter is also passed down to
+#'   [xgboost::xgboost()] to show computation status.
+#' @param ... Arguments passed to [xgboost::xgboost()]
+#' @param nrounds max number of boosting iterations,
+#'   argument passed to [xgboost::xgboost()]
+#' @param objective objective for xgboost,
+#'   argument passed to [xgboost::xgboost()]
+#' @return the imputed data set.
+#' @family imputation methods
+#' @examples 
+#' data(sleep)
+#' xgboostImpute(Dream~BodyWgt+BrainWgt,data=sleep)
+#' xgboostImpute(Dream+NonD~BodyWgt+BrainWgt,data=sleep)
+#' xgboostImpute(Dream+NonD+Gest~BodyWgt+BrainWgt,data=sleep)
+#' 
+#' sleepx <- sleep
+#' sleepx$Pred <- as.factor(LETTERS[sleepx$Pred])
+#' sleepx$Pred[1] <- NA
+#' xgboostImpute(Pred~BodyWgt+BrainWgt,data=sleepx)
+#' @export
+xgboostImpute <- function(formula, data, imp_var = TRUE,
+                         imp_suffix = "imp", verbose = FALSE,
+                         nrounds=100, objective=NULL,
+                         ...){
+  check_data(data)
+  formchar <- as.character(formula)
+  lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]])
+  rhs <- formchar[3]
+  rhs2 <- gsub(" ", "", strsplit(rhs, "\\+")[[1]])
+  #Missings in RHS variables
+  rhs_na <- apply(subset(data, select = rhs2), 1, function(x) any(is.na(x)))
+  #objective should be a vector of lenght equal to the lhs variables
+  if(!is.null(objective)){
+    stopifnot(length(objective)!=length(lhs))
+  }
+  for (lhsV in lhs) {
+    form <- as.formula(paste(lhsV, "~", rhs,"-1"))
+    # formula without left side for prediction
+    formPred <- as.formula(paste( "~", rhs,"-1"))
+    lhs_vector <- data[[lhsV]]
+    num_class <- NULL
+    if (!any(is.na(lhs_vector))) {
+      cat(paste0("No missings in ", lhsV, ".\n"))
+    } else {
+      lhs_na <- is.na(lhs_vector)
+      if (verbose)
+        message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations")
+      dattmp <- subset(data, !rhs_na & !lhs_na)
+      labtmp <- dattmp[[lhsV]]
+      currentClass <- NULL
+      if(inherits(labtmp,"factor")){
+        currentClass <- "factor"
+        labtmp <- as.integer(labtmp)-1
+        if(length(unique(labtmp))==2){
+          objective <- "binary:logistic"
+        }else if(length(unique(labtmp))>2){
+          objective <- "multi:softmax"
+          num_class <- max(labtmp)+1
+        }
+
+      }else if(inherits(labtmp,"integer")){
+        currentClass <- "integer"
+        if(length(unique(labtmp))==2){
+          lvlsInt <- unique(labtmp)
+          labtmp <- match(labtmp,lvlsInt)-1
+          warning("binary factor detected but not probproperlyably stored as factor.")
+          objective <- "binary:logistic"
+        }else{
+          objective <- "count:poisson"## Todo: this might not be wise as default
+        }
+      }else if(inherits(labtmp,"numeric")){
+        currentClass <- "numeric"
+        if(length(unique(labtmp))==2){
+          lvlsInt <- unique(labtmp)
+          labtmp <- match(labtmp,lvlsInt)-1
+          warning("binary factor detected but not properly stored as factor.")
+          objective <- "binary:logistic"
+        }else{
+          objective <- "reg:squarederror"
+        }
+      }
+
+
+      mm <- model.matrix(form,dattmp)
+      if(!is.null(num_class)){
+        mod <- xgboost::xgboost(data = mm, label = labtmp,
+                                nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose,...)
+      }else{
+        mod <- xgboost::xgboost(data = mm, label = labtmp,
+                                nrounds=nrounds, objective=objective, verbose = verbose,...)
+      }
+
+      if (verbose)
+        message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")
+      predictions <- 
+        predict(mod, model.matrix(formPred,subset(data, !rhs_na & lhs_na)))
+      if(currentClass=="factor"){
+        if(is.null(num_class)){
+          data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[as.numeric(predictions>.5)+1]  
+        }else{
+          data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[predictions+1]  
+        }
+      }else if(currentClass%in%c("numeric","integer")&objective=="binary:logistic"){
+        data[!rhs_na & lhs_na, lhsV] <- lvlsInt[as.numeric(predictions>.5)+1]  
+      }else{
+        data[!rhs_na & lhs_na, lhsV] <- predictions  
+      }
+
+    }
+
+    if (imp_var) {
+      if (imp_var %in% colnames(data)) {
+        data[, paste(lhsV, "_", imp_suffix, sep = "")] <- as.logical(data[, paste(lhsV, "_", imp_suffix, sep = "")])
+        warning(paste("The following TRUE/FALSE imputation status variables will be updated:",
+                      paste(lhsV, "_", imp_suffix, sep = "")))
+      } else {
+        data$NEWIMPTFVARIABLE <- is.na(lhs_vector)
+        colnames(data)[ncol(data)] <- paste(lhsV, "_", imp_suffix, sep = "")
+      }
+    }
+  }
+  data
+}
diff --git a/inst/tinytest/test_matchImpute.R b/inst/tinytest/test_matchImpute.R
@@ -0,0 +1,26 @@
+library(VIM)
+message("matchImpute general")
+setna <- function(d,i,col=2){
+  d[i,col] <- NA
+  d
+}
+d <- data.frame(x=LETTERS[1:6],y=as.double(1:6),z=as.double(1:6),
+                w=ordered(LETTERS[1:6]), stringsAsFactors = FALSE)
+dorig <- rbind(d,d)
+# minimal example with one match var
+d1 <- matchImpute(setna(dorig,7:12,1)[,1:2],match_var = "y", variable="x")
+expect_identical(d1$x[d1$x_imp],d1$x[!d1$x_imp])
+
+d1b <- matchImpute(setna(dorig,7:12,1)[,1:2],match_var = "y", variable="x", imp_var = FALSE)
+expect_identical(d1b$x[d1$x_imp],d1b$x[!d1$x_imp])
+expect_false("x_imp" %in% colnames(d1b))
+expect_true("x_imp" %in% colnames(d1))
+
+
+# all missing in x -> error
+expect_error(matchImpute(setna(dorig,1:12,1)[,1:2],match_var = "y", variable="x"))
+
+
+# example with two match vars
+d1 <- matchImpute(setna(dorig,7:12,1)[,1:3],match_var = c("y","z"), variable="x")
+expect_identical(d1$x[d1$x_imp],d1$x[!d1$x_imp])
diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R
@@ -0,0 +1,49 @@
+library(VIM)
+set.seed(104)
+x <- rnorm(100)
+df <- data.frame(
+  y = x + rnorm(100, sd = .01),
+  x = x,
+  fac = as.factor(x >= 0)
+)
+
+max_dist <- function(x, y) {
+  max(abs(x - y))
+}
+
+df$y[1:3] <- NA
+df$fac[3:5] <- NA
+df$binNum <- as.integer(df$fac)+17
+df$binInt <- as.integer(df$fac)+17L
+# xgboostImpute accuracy", {
+  df.out <- xgboostImpute(y ~ x, df)
+  expect_true(
+    max_dist(df.out$y, df$x)<
+    0.06
+  )
+
+  # xgboostImpute should do nothing for no missings", {
+  df.out <- xgboostImpute(x ~ y, df)
+  expect_identical(df.out$x, df$x)
+# 
+
+# factor response predicted accurately", {
+  df.out <- xgboostImpute(fac ~ x, df)
+  expect_identical(df.out$fac, as.factor(df$x >= 0))
+# 
+
+  # interger binary response predicted accurately", {
+  expect_warning(df.out <- xgboostImpute(binInt ~ x, df))
+  expect_identical(df.out$binInt==19, df$x >= 0)
+  # 
+  # numeric binary response predicted accurately", {
+  expect_warning(df.out <- xgboostImpute(binNum ~ x, df))
+  expect_identical(df.out$binNum==19, df$x >= 0)
+  # 
+# factor regressor used reasonably", {
+  df2 <- df
+  df2$x[1:10] <- NA
+  df.out <- xgboostImpute(x ~ fac, df2)
+  expect_identical(as.factor(df.out$x >= 0), df$fac)
+# 
+
diff --git a/man/hotdeck.Rd b/man/hotdeck.Rd
diff --git a/man/impPCA.Rd b/man/impPCA.Rd
diff --git a/man/irmi.Rd b/man/irmi.Rd
diff --git a/man/kNN.Rd b/man/kNN.Rd
diff --git a/man/matchImpute.Rd b/man/matchImpute.Rd
diff --git a/man/medianSamp.Rd b/man/medianSamp.Rd