Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYSTEMDS-3153] Imputation for all missing columns and seed handling #1888

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 68 additions & 79 deletions scripts/builtin/imputeByKNN.dml
Original file line number Diff line number Diff line change
Expand Up @@ -26,43 +26,38 @@
# the missing values by column means. Currently, only the column with the most
# missing values is actually imputed.
#
# ------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------
# INPUT:
# ------------------------------------------------------------------------------
# X Matrix with missing values, which are represented as NaNs
# method Method used for imputing missing values with different performance
# and accuracy tradeoffs:
# 'dist' (default): Compute all-pairs distances and impute the
# missing values by closest. O(N^2 * #features)
# 'dist_missing': Compute distances between data and records with
# missing values. O(N*M * #features), assuming
# that the number of records with MV is M<<N.
# 'dist_sample': Compute distances between sample of data and
# records with missing values. O(S*M * #features)
# with M<<N and S<<N, but suboptimal imputation.
# seed Root seed value for random/sample calls for deterministic behavior
# -1 for true randomization
# ------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------
# X Matrix with missing values, which are represented as NaNs
# method Method used for imputing missing values with different performance
# and accuracy tradeoffs:
# 'dist' (default): Compute all-pairs distances and impute the
# missing values by closest. O(N^2 * #features)
# 'dist_missing': Compute distances between data and records with
# missing values. O(N*M * #features), assuming
# that the number of records with MV is M<<N.
# 'dist_sample': Compute distances between sample of data and
# records with missing values. O(S*M * #features)
# with M<<N and S<<N, but suboptimal imputation.
# seed Root seed value for random/sample calls for deterministic behavior
# -1 for true randomization
# sample_frac Number of records to sample from the input matrix (used by method 'dist_sample')
# ---------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------
# result Imputed dataset
# ------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------

m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1,Int sample_frac = 1)
return(Matrix[Double] result)
{
#TODO fix seed handling (only root seed)
#TODO fix imputation for all columns with missing values

#KNN-Imputation Script

#Create a mask for placeholder and to check for missing values
masked = is.nan(X)

#Find the column containing multiple missing values
missing_col = rowIndexMax(colSums(is.nan(X)))

#Impute NaN value with temporary mean value of the column
filled_matrix = imputeByMean(X, matrix(0, cols = ncol(X), rows = 1))

Expand All @@ -76,30 +71,14 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
#Get the minimum distance row-wise computation
minimum_index = rowIndexMin(distance_matrix)

#Position of missing values in per row in which column
position = rowSums(is.nan(X))
position = position * minimum_index

#Filter the 0 out
I = (rowSums(is.nan(X))!=0)
missing = removeEmpty(target=position, margin="rows", select=I)
#Create aligned matrix from minimum index
aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))

#Convert the value indices into 0/1 matrix to find location
indices = table(missing, seq(1,nrow(filled_matrix)),odim1=nrow(filled_matrix),odim2=nrow(missing))
#Get the X records that need to be imputed
imputedValue = t(filled_matrix) %*% aligned

#Replace the index with value
imputedValue = t(indices) %*% filled_matrix[,as.scalar(missing_col)]

#Get the index location of the missing value
pos = rowSums(is.nan(X))
missing_indices = seq(1, nrow(pos)) * pos

#Put the replacement value in the missing indices
I2 = removeEmpty(target=missing_indices, margin="rows")
R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)

#Replace the masked column with to be imputed Value
masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
#Update the mask value
masked = t(imputedValue) * masked
}
else if(method == "dist_missing") {
#assuming a small number of records with missing values
Expand All @@ -116,64 +95,75 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
minD = rowIndexMin(t(D))

#Convert the value indices into 0/1 matrix to find location
indices = table(minD, seq(1,nrow(M2)),odim1=nrow(M2),odim2=nrow(minD))

#Replace the value
imputedValue = t(indices) %*% M2[,as.scalar(missing_col)]

#Get the index location of the missing value
pos = rowSums(is.nan(X))
pos = rowMaxs(is.nan(X))
missing_indices = seq(1, nrow(pos)) * pos

#Put the replacement value in the missing indices
I2 = removeEmpty(target=missing_indices, margin="rows")
R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
R = table(I2,1,minD,odim1 = nrow(X), odim2=1)

#Update the masked value
masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
#Replace the 0 to avoid error in table()
R = replace(target = R, pattern = 0, replacement = nrow(X)+1)

#Create aligned matrix from minimum index
aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))

#Reshape the subset
reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))

#Get the M2 records that need to be imputed
imputedValue = t(reshaped) %*% aligned

#Update the mask value
masked = t(imputedValue) * masked
}
else if(method == "dist_sample"){
#assuming a large number of records with missing values
#Split the matrix into records containing NaN values (missing) and records without NaN values (M3)
I = (rowSums(is.nan(X))!=0)
missing = removeEmpty(target=filled_matrix, margin="rows", select=I)

Y = (rowSums(is.nan(X))==0)
M3 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)

#Create a random subset
random_matrix = ceiling(rand(rows = nrow(M3), cols = 1, min = 0, max = 1, sparsity = 0.5, seed = seed))
#Seed handling
nSeed = ifelse(seed == -1, seed, seed * nrow(missing))
sampling = sample(nrow(X),sample_frac, TRUE, seed = nSeed)

#ensure that random_matrix has at least 1 value
if(as.scalar(colSums(random_matrix)) < 1)
random_matrix = matrix(1, rows = nrow(M3), cols = 1)
#Create a permutation matrix
perm = table(sampling, seq(1,nrow(X)),odim1 = nrow(X), odim2 = nrow(X))

subset = M3 * random_matrix
subset = removeEmpty(target=subset, margin = "rows", select = random_matrix)
#Create the sample rows
value = t(filled_matrix) %*% perm
subset = t(value)
subset = removeEmpty(target=subset, margin = "rows")

#Calculate the euclidean distance between the subset records and the missing records, and then find the index of the row-wise minimum
dotSubset = rowSums(subset * subset) %*% matrix(1, rows = 1, cols = nrow(missing))
dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = nrow(subset)))
D = sqrt(dotSubset + dotMissing - 2 * (subset %*% t(missing)))
minD = rowIndexMin(t(D))

#Convert the value indices into 0/1 matrix to find location
indices = table(minD, seq(1,nrow(subset)),odim1=nrow(subset),odim2=nrow(minD))

#Replace the value
imputedValue = t(indices) %*% subset[,as.scalar(missing_col)]

#Get the index location of the missing value
pos = rowSums(is.nan(X))
pos = rowMaxs(is.nan(X))
missing_indices = seq(1, nrow(pos)) * pos

#Put the replacement value in the missing indices
I2 = removeEmpty(target=missing_indices, margin="rows")
R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
R = table(I2,1,minD,odim1 = nrow(X), odim2=1)

#Replace the 0 to avoid error in table()
R = replace(target = R, pattern = 0, replacement = nrow(X)+1)

#Create aligned matrix from minimum index
aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))

#Update the masked value
masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
#Reshape the subset
reshaped = rbind(subset, matrix(0, rows = nrow(X) - nrow(subset), cols = ncol(X)))

#Get the subset records that need to be imputed
imputedValue = t(reshaped) %*% aligned

#Update the mask value
masked = t(imputedValue) * masked
}
else {
print("Method is unknown or not yet implemented")
Expand All @@ -182,5 +172,4 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
#Impute the value
result = replace(target = X, pattern = NaN, replacement = 0)
result = result + masked
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,4 @@ private void runImputeKNN(boolean defaultProb, ExecType instType) throws IOExcep
rtplatform = platform_old;
}
}
}
}
10 changes: 4 additions & 6 deletions src/test/scripts/functions/builtin/imputeByKNN.dml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,12 @@
# Prepare the data
X = read($1, data_type="frame", format="csv", header=TRUE, naStrings= ["20"]);
X = cbind(as.matrix(X[,4:5]), as.matrix(X[,7]))
remove_col = is.nan(X)

data = removeEmpty(target = X, margin = "rows", select = (remove_col[,1] != 1))
mask = is.nan(data)
mask = is.nan(X)

#Perform the KNN imputation
result = imputeByKNN(X = data, method = $2)
result2 = imputeByKNN(X = data, method = $3)
result = imputeByKNN(X = X, method = $2)
result2 = imputeByKNN(X = X, method = $3)

#Get the imputed value
I = (mask[,2] == 1);
Expand All @@ -41,4 +39,4 @@ value = colSums(value[,2])
value2 = colSums(value2[,2])

write(value, $4)
write(value2, $5)
write(value2, $5)
Loading