apache · msanyoto · Aug 6, 2023 · Aug 6, 2023 · Aug 9, 2023 · Aug 9, 2023
diff --git a/scripts/builtin/imputeByKNN.dml b/scripts/builtin/imputeByKNN.dml
@@ -26,43 +26,38 @@
 # the missing values by column means. Currently, only the column with the most
 # missing values is actually imputed.
 #
-# ------------------------------------------------------------------------------
+# ---------------------------------------------------------------------------------
 # INPUT:
-# ------------------------------------------------------------------------------
-# X          Matrix with missing values, which are represented as NaNs
-# method     Method used for imputing missing values with different performance
-#            and accuracy tradeoffs:
-#            'dist' (default): Compute all-pairs distances and impute the
-#                              missing values by closest. O(N^2 * #features)
-#            'dist_missing':   Compute distances between data and records with
-#                              missing values. O(N*M * #features), assuming
-#                              that the number of records with MV is M<<N.
-#            'dist_sample':    Compute distances between sample of data and
-#                              records with missing values. O(S*M * #features)
-#                              with M<<N and S<<N, but suboptimal imputation.
-# seed       Root seed value for random/sample calls for deterministic behavior
-#            -1 for true randomization
-# ------------------------------------------------------------------------------
+# ---------------------------------------------------------------------------------
+# X            Matrix with missing values, which are represented as NaNs
+# method       Method used for imputing missing values with different performance
+#              and accuracy tradeoffs:
+#              'dist' (default): Compute all-pairs distances and impute the
+#                                missing values by closest. O(N^2 * #features)
+#              'dist_missing':   Compute distances between data and records with
+#                                missing values. O(N*M * #features), assuming
+#                                that the number of records with MV is M<<N.
+#              'dist_sample':    Compute distances between sample of data and
+#                                records with missing values. O(S*M * #features)
+#                                with M<<N and S<<N, but suboptimal imputation.
+# seed         Root seed value for random/sample calls for deterministic behavior
+#              -1 for true randomization
+# sample_frac  Number of records to sample from X (only used by 'dist_sample')
+# ---------------------------------------------------------------------------------
 #
 # OUTPUT:
-# ------------------------------------------------------------------------------
+# ---------------------------------------------------------------------------------
 # result     Imputed dataset
-# ------------------------------------------------------------------------------
+# ---------------------------------------------------------------------------------
 
-m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
+m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1,Int sample_frac = 1)
   return(Matrix[Double] result)
 {
-  #TODO fix seed handling (only root seed)
-  #TODO fix imputation for all columns with missing values
-
   #KNN-Imputation Script
 
   #Create a mask for placeholder and to check for missing values
   masked = is.nan(X)
 
-  #Find the column containing multiple missing values
-  missing_col = rowIndexMax(colSums(is.nan(X)))
-
   #Impute NaN value with temporary mean value of the column
   filled_matrix = imputeByMean(X, matrix(0, cols = ncol(X), rows = 1))
 
@@ -76,30 +71,14 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
     #Get the minimum distance row-wise computation
     minimum_index = rowIndexMin(distance_matrix)
 
-    #Position of missing values in per row in which column
-    position = rowSums(is.nan(X))
-    position = position * minimum_index
-
-    #Filter the 0 out
-    I = (rowSums(is.nan(X))!=0)
-    missing = removeEmpty(target=position, margin="rows", select=I)
+    #Create aligned matrix from minimum index
+    aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
 
-    #Convert the value indices into 0/1 matrix to find location
-    indices = table(missing, seq(1,nrow(filled_matrix)),odim1=nrow(filled_matrix),odim2=nrow(missing))
+    #Get the X records that need to be imputed
+    imputedValue = t(filled_matrix) %*% aligned
 
-    #Replace the index with value
-    imputedValue = t(indices) %*% filled_matrix[,as.scalar(missing_col)]
-
-    #Get the index location of the missing value
-    pos = rowSums(is.nan(X))
-    missing_indices = seq(1, nrow(pos)) * pos
-
-    #Put the replacement value in the missing indices
-    I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
-
-    #Replace the masked column with to be imputed Value
-    masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+    #Update the mask value
+    masked = t(imputedValue) * masked
   }
   else if(method == "dist_missing") {
     #assuming small missing values
@@ -116,64 +95,75 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
     D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
     minD = rowIndexMin(t(D))
 
-    #Convert the value indices into 0/1 matrix to find location
-    indices = table(minD, seq(1,nrow(M2)),odim1=nrow(M2),odim2=nrow(minD))
-
-    #Replace the value
-    imputedValue = t(indices) %*% M2[,as.scalar(missing_col)]
-
     #Get the index location of the missing value
-    pos = rowSums(is.nan(X))
+    pos = rowMaxs(is.nan(X))
     missing_indices = seq(1, nrow(pos)) * pos
 
     #Put the replacement value in the missing indices
     I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+    R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
 
-    #Update the masked value
-    masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+    #Map the 0 entries (rows without missing values) to nrow(X)+1, since table() cannot take 0 as an index
+    R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
+
+    #Create aligned matrix from minimum index
+    aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
+
+    #Reshape the subset
+    reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
+
+    #Get the M2 records that need to be imputed
+    imputedValue = t(reshaped) %*% aligned
+
+    #Update the mask value
+    masked = t(imputedValue) * masked
   }
   else if(method == "dist_sample"){
     #assuming large missing values
     #Split the matrix into containing NaN values (missing records) and not containing NaN values (M2 records)
     I = (rowSums(is.nan(X))!=0)
     missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
 
-    Y = (rowSums(is.nan(X))==0)
-    M3 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
-    #Create a random subset
-    random_matrix = ceiling(rand(rows = nrow(M3), cols = 1, min = 0, max = 1, sparsity = 0.5, seed = seed))
+    #Derive the sampling seed from the root seed; -1 (true randomization) is passed through unchanged
+    nSeed = ifelse(seed == -1, seed, seed * nrow(missing))
+    sampling = sample(nrow(X),sample_frac, TRUE, seed = nSeed)
 
-    #ensure that random_matrix has at least 1 value
-    if(as.scalar(colSums(random_matrix)) < 1)
-      random_matrix = matrix(1, rows = nrow(M3), cols = 1)
+    #Create a 0/1 selection matrix from the sampled row indices
+    perm = table(sampling, seq(1,nrow(X)),odim1 = nrow(X), odim2 = nrow(X))
 
-    subset = M3 * random_matrix
-    subset = removeEmpty(target=subset, margin = "rows", select = random_matrix)
+    #Create the sample rows
+    value = t(filled_matrix) %*% perm
+    subset = t(value)
+    subset = removeEmpty(target=subset, margin = "rows")
 
     #Calculate the euclidean distance between fully records and missing records, and then find the min value row wise
     dotSubset = rowSums(subset * subset) %*% matrix(1, rows = 1, cols = nrow(missing))
     dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = nrow(subset)))
     D = sqrt(dotSubset + dotMissing - 2 * (subset %*% t(missing)))
     minD = rowIndexMin(t(D))
 
-    #Convert the value indices into 0/1 matrix to find location
-    indices = table(minD, seq(1,nrow(subset)),odim1=nrow(subset),odim2=nrow(minD))
-
-    #Replace the value
-    imputedValue = t(indices) %*% subset[,as.scalar(missing_col)]
-
     #Get the index location of the missing value
-    pos = rowSums(is.nan(X))
+    pos = rowMaxs(is.nan(X))
     missing_indices = seq(1, nrow(pos)) * pos
 
     #Put the replacement value in the missing indices
     I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+    R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
+
+    #Map the 0 entries (rows without missing values) to nrow(X)+1, since table() cannot take 0 as an index
+    R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
+
+    #Create aligned matrix from minimum index
+    aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
 
-    #Update the masked value
-    masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+    #Reshape the subset
+    reshaped = rbind(subset, matrix(0, rows = nrow(X) - nrow(subset), cols = ncol(X)))
+
+    #Get the subset records that need to be imputed
+    imputedValue = t(reshaped) %*% aligned
+
+    #Update the mask value
+    masked = t(imputedValue) * masked
   }
   else {
     print("Method is unknown or not yet implemented")
@@ -182,5 +172,4 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
   #Impute the value
   result = replace(target = X, pattern = NaN, replacement = 0)
   result = result + masked
-}
-
+}
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImputeKNNTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImputeKNNTest.java
@@ -74,4 +74,4 @@ private void runImputeKNN(boolean defaultProb, ExecType instType) throws IOExcep
             rtplatform = platform_old;
         }
     }
-}
+}
diff --git a/src/test/scripts/functions/builtin/imputeByKNN.dml b/src/test/scripts/functions/builtin/imputeByKNN.dml
@@ -22,14 +22,12 @@
 # Prepare the data
 X = read($1, data_type="frame", format="csv", header=TRUE, naStrings= ["20"]);
 X = cbind(as.matrix(X[,4:5]), as.matrix(X[,7]))
-remove_col = is.nan(X)
 
-data = removeEmpty(target = X, margin = "rows", select = (remove_col[,1] != 1))
-mask = is.nan(data)
+mask = is.nan(X)
 
 #Perform the KNN imputation
-result = imputeByKNN(X = data, method = $2)
-result2 = imputeByKNN(X = data, method = $3)
+result = imputeByKNN(X = X, method = $2)
+result2 = imputeByKNN(X = X, method = $3)
 
 #Get the imputed value
 I = (mask[,2] == 1);
@@ -41,4 +39,4 @@ value = colSums(value[,2])
 value2 = colSums(value2[,2])
 
 write(value, $4)
-write(value2, $5)
+write(value2, $5)