diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 55c5533..afd704f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -29,9 +29,9 @@ jobs: - {os: windows-latest, r: '3.6'} # Use older ubuntu to maximise backward compatibility - - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-18.04, r: 'release'} - - {os: ubuntu-18.04, r: 'oldrel-1'} + - {os: ubuntu-20.04, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-20.04, r: 'release'} + - {os: ubuntu-20.04, r: 'oldrel-1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index 06d2d54..cfc21d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Imports: laeken, ranger, MASS, + xgboost, data.table(>= 1.9.4) Suggests: dplyr, diff --git a/NAMESPACE b/NAMESPACE index 539f0c1..c3182e8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ export(scattMiss) export(scattmatrixMiss) export(spineMiss) export(tableMiss) +export(xgboostImpute) import(Rcpp) import(colorspace) import(data.table) diff --git a/NEWS.md b/NEWS.md index c1b90ed..4cdb7d4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# VIM 6.x.x + - fix infinite loop in matchImpute in case all observations of a variable are missing + # VIM 6.2.3 - default robust regression method for irmi for numeric variables changes from rlm to lmrob. diff --git a/R/gowerD.R b/R/gowerD.R index 7dfb314..83bc2ca 100644 --- a/R/gowerD.R +++ b/R/gowerD.R @@ -47,6 +47,19 @@ gowerD <- function(data.x, data.y = data.x, returnMin=FALSE, methodStand="range") { stopifnot(length(methodStand)==1&&methodStand%in%c("range", "iqr")) + if(length(levOrders)>0){ + stopifnot(length(levOrders)==length(orders)) + levOrdersUniqueX <- sapply(orders,function(x)length(unique(data.x[[x]]))) + if(any(levOrdersUniqueX!=levOrders)){ + warning("The number of unique values in the ordinal variables in data.x + does not correspond to the values given in levOrders") + } + levOrdersUniqueY <- sapply(orders,function(x)length(unique(data.x[[x]]))) + if(any(levOrdersUniqueY!=levOrders)){ + warning("The number of unique values in the ordinal variables in data.y + does not correspond to the values given in levOrders") + } + } maxplus1 <- function(x){ if(all(is.na(x))) return(1) @@ -86,6 +99,7 @@ gowerD <- function(data.x, data.y = data.x, 2,min0,na.rm=TRUE) rmax <- apply(rbind(data.x[,numerical,drop=FALSE], data.y[,numerical,drop=FALSE]),2,max1,na.rm=TRUE) + print(c(rmin,rmax)) }else if(methodStand == "iqr"){ rmin <- apply(rbind(data.x[,numerical,drop=FALSE],data.y[,numerical,drop=FALSE]),2,quantile,na.rm=TRUE, probs=.25) diff --git a/R/matchImpute.R b/R/matchImpute.R index f85c1a9..3a0660e 100644 --- a/R/matchImpute.R +++ b/R/matchImpute.R @@ -59,8 +59,14 @@ matchImpute <- function(data,variable=colnames(data)[!colnames(data)%in%match_va data <- as.data.table(data) else data <- data.table::copy(data) + tfna <- data[,sapply(lapply(.SD,is.na),all),.SDcols=variable] + if(any(tfna)){ + stop(paste0(variable[tfna],collapse=", ")," ", ifelse(sum(tfna)>1,"are","is")," completely missing") + + } na_present <- data[,sum(sapply(lapply(.SD,is.na),sum)),.SDcols=variable] + if(imp_var){ data[,paste(variable,imp_suffix,sep="_"):=lapply(.SD,is.na),.SDcols=variable] } diff --git a/R/rangerImpute.R b/R/rangerImpute.R index eeb720a..713cb00 100644 --- a/R/rangerImpute.R +++ b/R/rangerImpute.R @@ -31,22 +31,44 @@ rangerImpute <- function(formula, data, imp_var = TRUE, for (lhsV in lhs) { form <- as.formula(paste(lhsV, "~", rhs)) lhs_vector <- data[[lhsV]] + lhs_isfactor <- inherits(lhs_vector, "factor") + if (!any(is.na(lhs_vector))) { cat(paste0("No missings in ", lhsV, ".\n")) } else { lhs_na <- is.na(lhs_vector) if (verbose) message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations") - mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), ..., verbose = verbose) + + if(lhs_isfactor){ + mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), probability = TRUE, ..., verbose = verbose) + }else{ + mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), ..., verbose = verbose) + } + if (verbose) message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations") - if (median & inherits(lhs_vector, "numeric")) { - predictions <- apply( - predict(mod, subset(data, !rhs_na & lhs_na), predict.all = TRUE)$predictions, - 1, median) - } else { + + if(lhs_isfactor){ predictions <- predict(mod, subset(data, !rhs_na & lhs_na))$predictions + predict_levels <- colnames(predictions) + + predictions <- apply(predictions,1,function(z,lev){ + z <- cumsum(z) + z_lev <- lev[z>runif(1)] + return(z_lev[1]) + },lev=predict_levels) + + }else{ + if (median & inherits(lhs_vector, "numeric")) { + predictions <- apply( + predict(mod, subset(data, !rhs_na & lhs_na), predict.all = TRUE)$predictions, + 1, median) + } else { + predictions <- predict(mod, subset(data, !rhs_na & lhs_na))$predictions + } } + data[!rhs_na & lhs_na, lhsV] <- predictions } diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R new file mode 100644 index 0000000..b2923a0 --- /dev/null +++ b/R/xgboostImpute.R @@ -0,0 +1,132 @@ +#' Xgboost Imputation +#' +#' Impute missing values based on a random forest model using [xgboost::xgboost()] +#' @param formula model formula for the imputation +#' @param data A `data.frame` containing the data +#' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed +#' variable should be created show the imputation status +#' @param imp_suffix suffix used for TF imputation variables +#' @param verbose Show the number of observations used for training +#' and evaluating the RF-Model. This parameter is also passed down to +#' [xgboost::xgboost()] to show computation status. +#' @param ... Arguments passed to [xgboost::xgboost()] +#' @param nrounds max number of boosting iterations, +#' argument passed to [xgboost::xgboost()] +#' @param objective objective for xgboost, +#' argument passed to [xgboost::xgboost()] +#' @return the imputed data set. +#' @family imputation methods +#' @examples +#' data(sleep) +#' xgboostImpute(Dream~BodyWgt+BrainWgt,data=sleep) +#' xgboostImpute(Dream+NonD~BodyWgt+BrainWgt,data=sleep) +#' xgboostImpute(Dream+NonD+Gest~BodyWgt+BrainWgt,data=sleep) +#' +#' sleepx <- sleep +#' sleepx$Pred <- as.factor(LETTERS[sleepx$Pred]) +#' sleepx$Pred[1] <- NA +#' xgboostImpute(Pred~BodyWgt+BrainWgt,data=sleepx) +#' @export +xgboostImpute <- function(formula, data, imp_var = TRUE, + imp_suffix = "imp", verbose = FALSE, + nrounds=100, objective=NULL, + ...){ + check_data(data) + formchar <- as.character(formula) + lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]]) + rhs <- formchar[3] + rhs2 <- gsub(" ", "", strsplit(rhs, "\\+")[[1]]) + #Missings in RHS variables + rhs_na <- apply(subset(data, select = rhs2), 1, function(x) any(is.na(x))) + #objective should be a vector of lenght equal to the lhs variables + if(!is.null(objective)){ + stopifnot(length(objective)!=length(lhs)) + } + for (lhsV in lhs) { + form <- as.formula(paste(lhsV, "~", rhs,"-1")) + # formula without left side for prediction + formPred <- as.formula(paste( "~", rhs,"-1")) + lhs_vector <- data[[lhsV]] + num_class <- NULL + if (!any(is.na(lhs_vector))) { + cat(paste0("No missings in ", lhsV, ".\n")) + } else { + lhs_na <- is.na(lhs_vector) + if (verbose) + message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations") + dattmp <- subset(data, !rhs_na & !lhs_na) + labtmp <- dattmp[[lhsV]] + currentClass <- NULL + if(inherits(labtmp,"factor")){ + currentClass <- "factor" + labtmp <- as.integer(labtmp)-1 + if(length(unique(labtmp))==2){ + objective <- "binary:logistic" + }else if(length(unique(labtmp))>2){ + objective <- "multi:softmax" + num_class <- max(labtmp)+1 + } + + }else if(inherits(labtmp,"integer")){ + currentClass <- "integer" + if(length(unique(labtmp))==2){ + lvlsInt <- unique(labtmp) + labtmp <- match(labtmp,lvlsInt)-1 + warning("binary factor detected but not probproperlyably stored as factor.") + objective <- "binary:logistic" + }else{ + objective <- "count:poisson"## Todo: this might not be wise as default + } + }else if(inherits(labtmp,"numeric")){ + currentClass <- "numeric" + if(length(unique(labtmp))==2){ + lvlsInt <- unique(labtmp) + labtmp <- match(labtmp,lvlsInt)-1 + warning("binary factor detected but not properly stored as factor.") + objective <- "binary:logistic" + }else{ + objective <- "reg:squarederror" + } + } + + + mm <- model.matrix(form,dattmp) + if(!is.null(num_class)){ + mod <- xgboost::xgboost(data = mm, label = labtmp, + nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose,...) + }else{ + mod <- xgboost::xgboost(data = mm, label = labtmp, + nrounds=nrounds, objective=objective, verbose = verbose,...) + } + + if (verbose) + message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations") + predictions <- + predict(mod, model.matrix(formPred,subset(data, !rhs_na & lhs_na))) + if(currentClass=="factor"){ + if(is.null(num_class)){ + data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[as.numeric(predictions>.5)+1] + }else{ + data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[predictions+1] + } + }else if(currentClass%in%c("numeric","integer")&objective=="binary:logistic"){ + data[!rhs_na & lhs_na, lhsV] <- lvlsInt[as.numeric(predictions>.5)+1] + }else{ + data[!rhs_na & lhs_na, lhsV] <- predictions + } + + } + + if (imp_var) { + if (imp_var %in% colnames(data)) { + data[, paste(lhsV, "_", imp_suffix, sep = "")] <- as.logical(data[, paste(lhsV, "_", imp_suffix, sep = "")]) + warning(paste("The following TRUE/FALSE imputation status variables will be updated:", + paste(lhsV, "_", imp_suffix, sep = ""))) + } else { + data$NEWIMPTFVARIABLE <- is.na(lhs_vector) + colnames(data)[ncol(data)] <- paste(lhsV, "_", imp_suffix, sep = "") + } + } + } + data +} diff --git a/inst/tinytest/test_matchImpute.R b/inst/tinytest/test_matchImpute.R new file mode 100644 index 0000000..d33230e --- /dev/null +++ b/inst/tinytest/test_matchImpute.R @@ -0,0 +1,26 @@ +library(VIM) +message("matchImpute general") +setna <- function(d,i,col=2){ + d[i,col] <- NA + d +} +d <- data.frame(x=LETTERS[1:6],y=as.double(1:6),z=as.double(1:6), + w=ordered(LETTERS[1:6]), stringsAsFactors = FALSE) +dorig <- rbind(d,d) +# minimal example with one match var +d1 <- matchImpute(setna(dorig,7:12,1)[,1:2],match_var = "y", variable="x") +expect_identical(d1$x[d1$x_imp],d1$x[!d1$x_imp]) + +d1b <- matchImpute(setna(dorig,7:12,1)[,1:2],match_var = "y", variable="x", imp_var = FALSE) +expect_identical(d1b$x[d1$x_imp],d1b$x[!d1$x_imp]) +expect_false("x_imp" %in% colnames(d1b)) +expect_true("x_imp" %in% colnames(d1)) + + +# all missing in x -> error +expect_error(matchImpute(setna(dorig,1:12,1)[,1:2],match_var = "y", variable="x")) + + +# example with two match vars +d1 <- matchImpute(setna(dorig,7:12,1)[,1:3],match_var = c("y","z"), variable="x") +expect_identical(d1$x[d1$x_imp],d1$x[!d1$x_imp]) diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R new file mode 100644 index 0000000..8aafba9 --- /dev/null +++ b/inst/tinytest/test_xgboostImpute.R @@ -0,0 +1,49 @@ +library(VIM) +set.seed(104) +x <- rnorm(100) +df <- data.frame( + y = x + rnorm(100, sd = .01), + x = x, + fac = as.factor(x >= 0) +) + +max_dist <- function(x, y) { + max(abs(x - y)) +} + +df$y[1:3] <- NA +df$fac[3:5] <- NA +df$binNum <- as.integer(df$fac)+17 +df$binInt <- as.integer(df$fac)+17L +# xgboostImpute accuracy", { + df.out <- xgboostImpute(y ~ x, df) + expect_true( + max_dist(df.out$y, df$x)< + 0.06 + ) + + # xgboostImpute should do nothing for no missings", { + df.out <- xgboostImpute(x ~ y, df) + expect_identical(df.out$x, df$x) +# + +# factor response predicted accurately", { + df.out <- xgboostImpute(fac ~ x, df) + expect_identical(df.out$fac, as.factor(df$x >= 0)) +# + + # interger binary response predicted accurately", { + expect_warning(df.out <- xgboostImpute(binInt ~ x, df)) + expect_identical(df.out$binInt==19, df$x >= 0) + # + # numeric binary response predicted accurately", { + expect_warning(df.out <- xgboostImpute(binNum ~ x, df)) + expect_identical(df.out$binNum==19, df$x >= 0) + # +# factor regressor used reasonably", { + df2 <- df + df2$x[1:10] <- NA + df.out <- xgboostImpute(x ~ fac, df2) + expect_identical(as.factor(df.out$x >= 0), df$fac) +# + diff --git a/man/hotdeck.Rd b/man/hotdeck.Rd index 99e09c9..c73cc5c 100644 --- a/man/hotdeck.Rd +++ b/man/hotdeck.Rd @@ -100,7 +100,8 @@ Other imputation methods: \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \author{ Alexander Kowarik diff --git a/man/impPCA.Rd b/man/impPCA.Rd index 1e10ca8..d946c3e 100644 --- a/man/impPCA.Rd +++ b/man/impPCA.Rd @@ -91,7 +91,8 @@ Other imputation methods: \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \author{ Matthias Templ diff --git a/man/irmi.Rd b/man/irmi.Rd index 064d3b9..2c7d3e0 100644 --- a/man/irmi.Rd +++ b/man/irmi.Rd @@ -160,7 +160,8 @@ Other imputation methods: \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \author{ Matthias Templ, Alexander Kowarik diff --git a/man/kNN.Rd b/man/kNN.Rd index d730445..968597d 100644 --- a/man/kNN.Rd +++ b/man/kNN.Rd @@ -123,7 +123,8 @@ Other imputation methods: \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \author{ Alexander Kowarik, Statistik Austria diff --git a/man/matchImpute.Rd b/man/matchImpute.Rd index 42f8699..760e79a 100644 --- a/man/matchImpute.Rd +++ b/man/matchImpute.Rd @@ -59,7 +59,8 @@ Other imputation methods: \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \author{ Johannes Gussenbauer, Alexander Kowarik diff --git a/man/medianSamp.Rd b/man/medianSamp.Rd index a8c7896..b3598a6 100644 --- a/man/medianSamp.Rd +++ b/man/medianSamp.Rd @@ -24,6 +24,7 @@ Other imputation methods: \code{\link{matchImpute}()}, \code{\link{rangerImpute}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \concept{imputation methods} diff --git a/man/rangerImpute.Rd b/man/rangerImpute.Rd index ddfc418..3feafdc 100644 --- a/man/rangerImpute.Rd +++ b/man/rangerImpute.Rd @@ -52,6 +52,7 @@ Other imputation methods: \code{\link{matchImpute}()}, \code{\link{medianSamp}()}, \code{\link{regressionImp}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \concept{imputation methods} diff --git a/man/regressionImp.Rd b/man/regressionImp.Rd index 1859e30..85e949a 100644 --- a/man/regressionImp.Rd +++ b/man/regressionImp.Rd @@ -68,7 +68,8 @@ Other imputation methods: \code{\link{matchImpute}()}, \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, -\code{\link{sampleCat}()} +\code{\link{sampleCat}()}, +\code{\link{xgboostImpute}()} } \author{ Alexander Kowarik diff --git a/man/sampleCat.Rd b/man/sampleCat.Rd index be7c8a6..9a6614d 100644 --- a/man/sampleCat.Rd +++ b/man/sampleCat.Rd @@ -24,6 +24,7 @@ Other imputation methods: \code{\link{matchImpute}()}, \code{\link{medianSamp}()}, \code{\link{rangerImpute}()}, -\code{\link{regressionImp}()} +\code{\link{regressionImp}()}, +\code{\link{xgboostImpute}()} } \concept{imputation methods} diff --git a/man/xgboostImpute.Rd b/man/xgboostImpute.Rd new file mode 100644 index 0000000..9e0a9c9 --- /dev/null +++ b/man/xgboostImpute.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboostImpute.R +\name{xgboostImpute} +\alias{xgboostImpute} +\title{Xgboost Imputation} +\usage{ +xgboostImpute( + formula, + data, + imp_var = TRUE, + imp_suffix = "imp", + verbose = FALSE, + nrounds = 100, + objective = NULL, + ... +) +} +\arguments{ +\item{formula}{model formula for the imputation} + +\item{data}{A \code{data.frame} containing the data} + +\item{imp_var}{\code{TRUE}/\code{FALSE} if a \code{TRUE}/\code{FALSE} variables for each imputed +variable should be created show the imputation status} + +\item{imp_suffix}{suffix used for TF imputation variables} + +\item{verbose}{Show the number of observations used for training +and evaluating the RF-Model. This parameter is also passed down to +\code{\link[xgboost:xgb.train]{xgboost::xgboost()}} to show computation status.} + +\item{nrounds}{max number of boosting iterations, +argument passed to \code{\link[xgboost:xgb.train]{xgboost::xgboost()}}} + +\item{objective}{objective for xgboost, +argument passed to \code{\link[xgboost:xgb.train]{xgboost::xgboost()}}} + +\item{...}{Arguments passed to \code{\link[xgboost:xgb.train]{xgboost::xgboost()}}} +} +\value{ +the imputed data set. +} +\description{ +Impute missing values based on a random forest model using \code{\link[xgboost:xgb.train]{xgboost::xgboost()}} +} +\examples{ +data(sleep) +xgboostImpute(Dream~BodyWgt+BrainWgt,data=sleep) +xgboostImpute(Dream+NonD~BodyWgt+BrainWgt,data=sleep) +xgboostImpute(Dream+NonD+Gest~BodyWgt+BrainWgt,data=sleep) + +sleepx <- sleep +sleepx$Pred <- as.factor(LETTERS[sleepx$Pred]) +sleepx$Pred[1] <- NA +xgboostImpute(Pred~BodyWgt+BrainWgt,data=sleepx) +} +\seealso{ +Other imputation methods: +\code{\link{hotdeck}()}, +\code{\link{impPCA}()}, +\code{\link{irmi}()}, +\code{\link{kNN}()}, +\code{\link{matchImpute}()}, +\code{\link{medianSamp}()}, +\code{\link{rangerImpute}()}, +\code{\link{regressionImp}()}, +\code{\link{sampleCat}()} +} +\concept{imputation methods} diff --git a/vignettes/irmi.Rmd b/vignettes/irmi.Rmd index f27bb94..a68cd87 100644 --- a/vignettes/irmi.Rmd +++ b/vignettes/irmi.Rmd @@ -23,7 +23,7 @@ In addition to Model based Imputation Methods (see `vignette("modelImp")`) the ` This vignette showcases the function `irmi()`. **IRMI** is short for **I**terative **R**obust **M**odel-based **I**mputation. This method can be used to generate imputations for several variables in a dataset. -Basically `irmi()` mimics the functionality of IVEWARE [(Raghunathan et al., 2001)](https://www.semanticscholar.org/paper/A-multivariate-technique-for-multiply-imputing-a-of-Raghunathan-Lepkowski/13b30e35b9a54dad07094cfe4f50d40ff15d8370?p2df), but there are several improvements with respect to the stability of the initialized values, or the robustness of the imputed values. In contrast to other imputation methods, the IRMI algorithm does not require at least one fully observed variable. In each step of the iteration, one variable is used as a response variable and the remaining variables serve as the regressors. Thus the "whole" multivariate information will be used for imputation in the response variable. For more details, please see [IRMI Imputation](http://file.statistik.tuwien.ac.at/filz/papers/CSDA11TKF.pdf). +Basically `irmi()` mimics the functionality of IVEWARE [(Raghunathan et al., 2001)](https://www150.statcan.gc.ca/n1/pub/12-001-x/2001001/article/5857-eng.pdf), but there are several improvements with respect to the stability of the initialized values, or the robustness of the imputed values. In contrast to other imputation methods, the IRMI algorithm does not require at least one fully observed variable. In each step of the iteration, one variable is used as a response variable and the remaining variables serve as the regressors. Thus the "whole" multivariate information will be used for imputation in the response variable. For more details, please see [IRMI Imputation](http://file.statistik.tuwien.ac.at/filz/papers/CSDA11TKF.pdf). ## Data