Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYSTEMDS-3153] Missing value imputation using KNN #1879

Closed
wants to merge 8 commits into from
179 changes: 179 additions & 0 deletions scripts/builtin/imputeByKNN.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Impute the data by using KNN-algorithm and finding the nearest neighbor using euclidean distance
# method. To avoid NaN value during distance calculation, NaN values are temporarily replaced
# with mean value of the respective column using ImputeByMean method.
# Currently only for single column with multiple missing values and top 1 neighbor
# Methods currently implemented
# dist - Default method. Really small, coarse-grained operation.
# Distance computation with complexity O(n^2 * #features)
# dist_missing - Assumes the number of missing values is small. Computes the euclidean distances
# between the entire "X" (potentially large) and the rows with missing values.
# dist_sample - Assumes the number of missing values is large. Computes the euclidean distances
# between a sample of rows of X (randomly selected, acting as control) and the rows
# with missing values (if not small).
#
# INPUT:
# --------------------------------------------------------------------------------------------------
# X Data Matrix with missing values (numerical features)
# method Methods of calculating the KNN-algorithm depending on the size of data and missing values
# seed Set seed value for random/sample calls to ensure deterministic behavior when needed
# --------------------------------------------------------------------------------------------------
#
# OUTPUT:
# --------------------------------------------------------------------------------------------------
# result imputed dataset
# --------------------------------------------------------------------------------------------------

m_imputeByKNN = function(Matrix[Double] X, String method = "dist", Integer seed = 33)
  return(Matrix[Double] result)
{
  # KNN imputation (top-1 neighbor) of the single column of X that holds the
  # most missing values.
  #
  # X      - numeric matrix, NaN marks a missing cell
  # method - "dist", "dist_missing" or "dist_sample"; any other value aborts
  # seed   - seed for the random row sample drawn by "dist_sample"
  # result - X with the NaN cells of the target column replaced by the value
  #          of the nearest donor row; X itself if it contains no NaN

  # Fail fast on an unsupported method. Falling through would add the raw 0/1
  # NaN mask to X and silently turn every missing cell into the value 1.
  if(method != "dist" & method != "dist_missing" & method != "dist_sample") {
    stop("imputeByKNN: method '" + method + "' is unknown or not yet implemented")
  }

  # 0/1 mask of the missing cells; later carries the imputed values
  masked = is.nan(X)

  if(sum(masked) == 0) {
    # Nothing to impute; the index arithmetic below needs at least one NaN row
    result = X
  }
  else {
    # 0/1 indicator of the rows that contain at least one NaN
    has_nan = (rowSums(masked) != 0)

    # Column with the largest number of missing values (the one that is imputed)
    target_col = as.scalar(rowIndexMax(colSums(masked)))

    # Temporarily fill NaN with the column mean so distances can be computed
    filled_matrix = imputeByMean(X, matrix(0, cols = ncol(X), rows = 1))

    if(method == "dist") {
      # Full pairwise distances between all (mean-filled) rows
      distance_matrix = dist(filled_matrix)

      # Overwrite zeros so rowIndexMin does not pick the diagonal (self) entry
      distance_matrix = replace(target = distance_matrix, pattern = 0, replacement = 999)

      # Nearest-neighbor row index, kept only for the rows with missing values
      nn_index = removeEmpty(target = rowIndexMin(distance_matrix), margin = "rows", select = has_nan)
      donors = filled_matrix
    }
    else {
      # Split into rows with missing values and complete rows (donor candidates)
      missing = removeEmpty(target = filled_matrix, margin = "rows", select = has_nan)
      donors = removeEmpty(target = filled_matrix, margin = "rows", select = (has_nan == 0))

      if(method == "dist_sample") {
        # Random 0/1 selection vector over the complete rows
        random_matrix = ceiling(rand(rows = nrow(donors), cols = 1, min = 0, max = 1, sparsity = 0.5, seed = seed))

        # Ensure that at least one donor row is selected
        if(sum(random_matrix) < 1) {
          random_matrix = matrix(1, rows = nrow(donors), cols = 1)
        }
        donors = removeEmpty(target = donors, margin = "rows", select = random_matrix)
      }

      # Squared euclidean distance (donors x missing) via |a|^2 + |b|^2 - 2ab.
      # The square root is omitted on purpose: it does not change the argmin,
      # and it would produce NaN when rounding makes an entry slightly negative.
      dotDonors = rowSums(donors * donors) %*% matrix(1, rows = 1, cols = nrow(missing))
      dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = nrow(donors)))
      D = dotDonors + dotMissing - 2 * (donors %*% t(missing))

      # Index of the closest donor for every row with missing values
      nn_index = rowIndexMin(t(D))
    }

    # 0/1 selection matrix (donors x missing rows); the sequence is sized to
    # nn_index so that both table inputs have matching dimensions
    indices = table(nn_index, seq(1, nrow(nn_index)), odim1 = nrow(donors), odim2 = nrow(nn_index))

    # Value of the target column taken from each nearest donor
    imputedValue = t(indices) %*% donors[, target_col]

    # Scatter the imputed values back to the original row positions
    missing_indices = removeEmpty(target = seq(1, nrow(X)) * has_nan, margin = "rows")
    R = table(missing_indices, 1, imputedValue, odim1 = nrow(X), odim2 = 1)

    # Turn the 0/1 mask of the target column into the values to be inserted
    masked[, target_col] = masked[, target_col] * R

    # Impute: zero out NaN and add the replacement values
    result = replace(target = X, pattern = NaN, replacement = 0)
    result = result + masked
  }
}
1 change: 1 addition & 0 deletions src/main/java/org/apache/sysds/common/Builtins.java
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ public enum Builtins {
IMG_INVERT("img_invert", true),
IMG_POSTERIZE("img_posterize", true),
IMPURITY_MEASURES("impurityMeasures", true),
IMPUTE_BY_KNN("imputeByKNN", true),
IMPUTE_BY_MEAN("imputeByMean", true),
IMPUTE_BY_MEAN_APPLY("imputeByMeanApply", true),
IMPUTE_BY_MEDIAN("imputeByMedian", true),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.functions.builtin.part1;

import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.junit.Test;

import java.io.IOException;
import java.util.HashMap;

/**
 * Runs the imputeByKNN test script with the methods "dist" and "dist_missing"
 * and checks that the two resulting outputs (B and B2) agree within a tolerance.
 */
public class BuiltinImputeKNNTest extends AutomatedTestBase {

	private final static String TEST_NAME = "imputeByKNN";
	private final static String TEST_DIR = "functions/builtin/";
	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinImputeKNNTest.class.getSimpleName() + "/";

	// Methods handed to the script; also used as labels in the comparison output
	private static final String METHOD_1 = "dist";
	private static final String METHOD_2 = "dist_missing";

	// Tolerance passed to the matrix comparison of the two outputs
	private static final double EPS = 10;

	@Override
	public void setUp() {
		TestUtils.clearAssertionInformation();
		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"B","B2"}));
	}

	@Test
	public void testDefaultCP() throws IOException {
		runImputeKNN(true, Types.ExecType.CP);
	}

	@Test
	public void testDefaultSpark() throws IOException {
		runImputeKNN(true, Types.ExecType.SPARK);
	}

	/**
	 * Executes the DML test script and compares its two outputs.
	 *
	 * @param defaultProb currently unused
	 * @param instType    execution type the script is run with
	 * @throws IOException declared for the output reading helpers
	 */
	private void runImputeKNN(boolean defaultProb, Types.ExecType instType) throws IOException {
		Types.ExecMode platform_old = setExecMode(instType);
		try {
			loadTestConfiguration(getTestConfiguration(TEST_NAME));
			String HOME = SCRIPT_DIR + TEST_DIR;
			fullDMLScriptName = HOME + TEST_NAME + ".dml";
			programArgs = new String[] {"-args", DATASET_DIR+"Salaries.csv",
				METHOD_1, METHOD_2, output("B"), output("B2")};

			runTest(true, false, null, -1);

			//Compare matrices, check if the sum of the imputed value is roughly the same
			HashMap<MatrixValue.CellIndex, Double> dmlfile = readDMLMatrixFromOutputDir("B");
			HashMap<MatrixValue.CellIndex, Double> dmlfile2 = readDMLMatrixFromOutputDir("B2");
			// Label the matrices with the methods that actually produced them
			TestUtils.compareMatrices(dmlfile, dmlfile2, EPS, METHOD_1, METHOD_2);
		} finally {
			// Restore the execution mode that was active before this test
			rtplatform = platform_old;
		}
	}
}
43 changes: 43 additions & 0 deletions src/test/scripts/functions/builtin/imputeByKNN.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Load the input as a frame ("20" is read as a missing value) and keep columns 4 and 5
raw = read($1, data_type="frame", format="csv", header=TRUE, naStrings= ["20"]);
X = as.matrix(cbind(raw[,4], raw[,5]))

# Drop the rows whose first column is missing
nan_cells = is.nan(X)
keep_rows = (nan_cells[,1] != 1)
data = removeEmpty(target = X, margin = "rows", select = keep_rows)

# Rows whose second column is missing, i.e. the rows that get imputed
mask = is.nan(data)
imputed_rows = (mask[,2] == 1)

# Run the KNN imputation once per requested method
result = imputeByKNN(X = data, method = $2)
result2 = imputeByKNN(X = data, method = $3)

# Extract the imputed rows of each result
picked = removeEmpty(target = result, margin = "rows", select = imputed_rows)
picked2 = removeEmpty(target = result2, margin = "rows", select = imputed_rows)

# Sum up the imputed values of the second column
value = colSums(picked[,2])
value2 = colSums(picked2[,2])

write(value, $4)
write(value2, $5)
Loading