diff --git a/src/main/java/org/apache/sysds/common/Types.java b/src/main/java/org/apache/sysds/common/Types.java index 78ed173bc9e..7bf48f5da5e 100644 --- a/src/main/java/org/apache/sysds/common/Types.java +++ b/src/main/java/org/apache/sysds/common/Types.java @@ -213,6 +213,7 @@ public enum BlockType{ ULTRA_SPARSE_BLOCK, SPARSE_BLOCK, DENSE_BLOCK, + DEDUP_BLOCK, } /** diff --git a/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java b/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java index b2273faa75b..e9428eb3714 100644 --- a/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java +++ b/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java @@ -19,236 +19,281 @@ package org.apache.sysds.runtime.data; -import org.apache.commons.lang3.NotImplementedException; +import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.common.Types; import org.apache.sysds.runtime.util.UtilFunctions; +import org.apache.sysds.utils.MemoryEstimates; import java.util.Arrays; import java.util.HashMap; public class DenseBlockFP64DEDUP extends DenseBlockDRB{ - private static final long serialVersionUID = 4124905190428752213L; - private double[][] _data; - - protected DenseBlockFP64DEDUP(int[] dims) { - super(dims); - reset(_rlen, _odims, 0); - } - - @Override - protected void allocateBlock(int bix, int length) { - _data[bix] = new double[length]; - } - - @Override - public void reset(int rlen, int[] odims, double v) { - if(rlen > capacity() / _odims[0]) - _data = new double[rlen][]; - else{ - if(v == 0.0){ - for(int i = 0; i < rlen; i++) - _data[i] = null; - } - else { - for(int i = 0; i < rlen; i++){ - if(odims[0] > _odims[0] ||_data[i] == null ) - allocateBlock(i, odims[0]); - Arrays.fill(_data[i], 0, odims[0], v); - } - } - } - _rlen = rlen; - _odims = odims; - } - - @Override - public void resetNoFill(int rlen, int[] odims) { - if(_data == null || rlen > _rlen){ - _data = new double[rlen][]; - } - _rlen = rlen; - _odims = odims; - } - - @Override - public boolean isNumeric() { - return true; - } - - @Override - public boolean isNumeric(Types.ValueType vt) { - return Types.ValueType.FP64 == vt; - } - - @Override - public long capacity() { - return (_data != null) ? _data.length*_odims[0] : -1; - } - - @Override - public long countNonZeros(){ - long nnz = 0; - HashMap cache = new HashMap<>(); - for (int i = 0; i < _rlen; i++) { - double[] row = this._data[i]; - if(row == null) - continue; - Long count = cache.getOrDefault(row, null); - if(count == null){ - count = Long.valueOf(countNonZeros(i)); - cache.put(row, count); - } - nnz += count; - } - return nnz; - } - - @Override - public int countNonZeros(int r) { - return _data[r] == null ? 0 : UtilFunctions.computeNnz(_data[r], 0, _odims[0]); - } - - @Override - protected long computeNnz(int bix, int start, int length) { - int nnz = 0; - int row_start = (int) Math.floor(start / _odims[0]); - int col_start = start % _odims[0]; - for (int i = 0; i < length; i++) { - if(_data[row_start] == null){ - i += _odims[0] - 1 - col_start; - col_start = 0; - row_start += 1; - continue; - } - nnz += _data[row_start][col_start] != 0 ? 
1 : 0; - col_start += 1; - if(col_start == _odims[0]) { - col_start = 0; - row_start += 1; - } - } - return nnz; - } - - @Override - public int pos(int r){ - return 0; - } - - @Override - public int pos(int[] ix){ - int pos = ix[ix.length - 1]; - for(int i = 1; i < ix.length - 1; i++) - pos += ix[i] * _odims[i]; - return pos; - } - - @Override - public double[] values(int r) { - return valuesAt(r); - } - - @Override - public double[] valuesAt(int bix) { - return _data[bix] == null ? new double[_odims[0]] : _data[bix]; - } - - @Override - public int index(int r) { - return r; - } - - @Override - public int numBlocks(){ - return _data.length; - } - - @Override - public int size(int bix) { - return _odims[0]; - } - - @Override - public void incr(int r, int c) { - incr(r,c,1.0); - } - - @Override - public void incr(int r, int c, double delta) { - if(_data[r] == null) - allocateBlock(r, _odims[0]); - _data[r][c] += delta; - } - - @Override - protected void fillBlock(int bix, int fromIndex, int toIndex, double v) { - if(_data[bix] == null) - allocateBlock(bix, _odims[0]); - Arrays.fill(_data[bix], fromIndex, toIndex, v); - } - - @Override - protected void setInternal(int bix, int ix, double v) { - set(bix, ix, v); - } - - @Override - public DenseBlock set(int r, int c, double v) { - if(_data[r] == null) - _data[r] = new double[_odims[0]]; - _data[r][c] = v; - return this; - } - - @Override - public DenseBlock set(int r, double[] v) { - if(v.length == _odims[0]) - _data[r] = v; - else - throw new RuntimeException("set Denseblock called with an array length [" + v.length +"], array to overwrite is of length [" + _odims[0] + "]"); - return this; - } - - @Override - public DenseBlock set(DenseBlock db) { - throw new NotImplementedException(); - } - - @Override - public DenseBlock set(int[] ix, double v) { - return set(ix[0], pos(ix), v); - } - - @Override - public DenseBlock set(int[] ix, long v) { - return set(ix[0], pos(ix), v); - } - - @Override - public DenseBlock set(int[] ix, String v) { - return set(ix[0], pos(ix), Double.parseDouble(v)); - } - - @Override - public double get(int r, int c) { - if(_data[r] == null) - return 0.0; - else - return _data[r][c]; - } - - @Override - public double get(int[] ix) { - return get(ix[0], pos(ix)); - } - - @Override - public String getString(int[] ix) { - return String.valueOf(get(ix[0], pos(ix))); - } - - @Override - public long getLong(int[] ix) { - return UtilFunctions.toLong(get(ix[0], pos(ix))); - } + private double[][] _data; + private int _distinct = 0; + + protected DenseBlockFP64DEDUP(int[] dims) { + super(dims); + reset(_rlen, _odims, 0); + } + + public int getNrDistinctRows(){ + return _distinct; + } + + @Override + protected void allocateBlock(int bix, int length) { + _data[bix] = new double[length]; + } + + @Override + public void reset(int rlen, int[] odims, double v) { + if(rlen > capacity() / _odims[0]) + _data = new double[rlen][]; + else{ + if(v == 0.0){ + for(int i = 0; i < rlen; i++) + _data[i] = null; + } + else { + for(int i = 0; i < rlen; i++){ + if(odims[0] > _odims[0] ||_data[i] == null ) + allocateBlock(i, odims[0]); + Arrays.fill(_data[i], 0, odims[0], v); + } + } + } + _rlen = rlen; + _odims = odims; + } + + @Override + public void resetNoFill(int rlen, int[] odims) { + if(_data == null || rlen > _rlen){ + _data = new double[rlen][]; + } + _rlen = rlen; + _odims = odims; + } + + @Override + public boolean isNumeric() { + return true; + } + + @Override + public boolean isNumeric(Types.ValueType vt) { + return Types.ValueType.FP64 
== vt;
+	}
+
+	@Override
+	public long capacity() {
+		return (_data != null) ? ((long) _data.length)*_odims[0] : -1;
+	}
+
+	@Override
+	public long countNonZeros(){
+		long nnz = 0;
+		HashMap<double[], Long> cache = new HashMap<>();
+		for (int i = 0; i < _rlen; i++) {
+			double[] row = this._data[i];
+			if(row == null)
+				continue;
+			Long count = cache.getOrDefault(row, null);
+			if(count == null){
+				count = Long.valueOf(countNonZeros(i));
+				cache.put(row, count);
+			}
+			nnz += count;
+		}
+		this._distinct = cache.size();
+		return nnz;
+	}
+
+	@Override
+	public int countNonZeros(int r) {
+		return _data[r] == null ? 0 : UtilFunctions.computeNnz(_data[r], 0, _odims[0]);
+	}
+
+	@Override
+	public long countNonZeros(int rl, int ru, int ol, int ou) {
+		long nnz = 0;
+		HashMap<double[], Long> cache = new HashMap<>();
+		for (int i = rl; i < ru; i++) {
+			double[] row = this._data[i];
+			if(row == null)
+				continue;
+			Long count = cache.getOrDefault(row, null);
+			if(count == null){
+				count = Long.valueOf(UtilFunctions.computeNnz(_data[i], ol, ou));
+				cache.put(row, count);
+			}
+			nnz += count;
+		}
+		return nnz;
+	}
+
+	@Override
+	protected long computeNnz(int bix, int start, int length) {
+		int nnz = 0;
+		int row_start = (int) Math.floor(start / _odims[0]);
+		int col_start = start % _odims[0];
+		for (int i = 0; i < length; i++) {
+			if(_data[row_start] == null){
+				i += _odims[0] - 1 - col_start;
+				col_start = 0;
+				row_start += 1;
+				continue;
+			}
+			nnz += _data[row_start][col_start] != 0 ? 1 : 0;
+			col_start += 1;
+			if(col_start == _odims[0]) {
+				col_start = 0;
+				row_start += 1;
+			}
+		}
+		return nnz;
+	}
+
+	@Override
+	public int pos(int r){
+		return 0;
+	}
+
+	@Override
+	public int pos(int r, int c){
+		return c;
+	}
+
+	@Override
+	public int pos(int[] ix){
+		int pos = ix[ix.length - 1];
+		for(int i = 1; i < ix.length - 1; i++)
+			pos += ix[i] * _odims[i];
+		return pos;
+	}
+
+	@Override
+	public int blockSize(int bix) {
+		return 1;
+	}
+
+	public boolean isContiguous(int rl, int ru){
+		return rl == ru;
+	}
+
+	@Override
+	public double[] values(int r) {
+		return valuesAt(r);
+	}
+
+	@Override
+	public double[] valuesAt(int bix) {
+		return _data[bix] == null ? new double[_odims[0]] : _data[bix];
+	}
+
+	@Override
+	public int index(int r) {
+		return r;
+	}
+
+	@Override
+	public int numBlocks(){
+		return _data.length;
+	}
+
+	@Override
+	public int size(int bix) {
+		return _odims[0];
+	}
+
+	@Override
+	public void incr(int r, int c) {
+		incr(r,c,1.0);
+	}
+
+	@Override
+	public void incr(int r, int c, double delta) {
+		if(_data[r] == null)
+			allocateBlock(r, _odims[0]);
+		_data[r][c] += delta;
+	}
+
+	@Override
+	protected void fillBlock(int bix, int fromIndex, int toIndex, double v) {
+		if(_data[bix] == null)
+			allocateBlock(bix, _odims[0]);
+		Arrays.fill(_data[bix], fromIndex, toIndex, v);
+	}
+
+	@Override
+	protected void setInternal(int bix, int ix, double v) {
+		set(bix, ix, v);
+	}
+
+	@Override
+	public DenseBlock set(int r, int c, double v) {
+		if(_data[r] == null)
+			_data[r] = new double[_odims[0]];
+		_data[r][c] = v;
+		return this;
+	}
+
+	@Override
+	public DenseBlock set(int r, double[] v) {
+		if(v.length == _odims[0])
+			_data[r] = v;
+		else
+			throw new RuntimeException("set DenseBlock called with an array of length [" + v.length + "], but the row to overwrite has length [" + _odims[0] + "]");
+		return this;
+	}
+
+	@Override
+	public DenseBlock set(DenseBlock db) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public DenseBlock set(int[] ix, double v) {
+		return set(ix[0], pos(ix), v);
+	}
+
+	@Override
+	public DenseBlock set(int[] ix, long v) {
+		return set(ix[0], pos(ix), v);
+	}
+
+	@Override
+	public DenseBlock set(int[] ix, String v) {
+		return set(ix[0], pos(ix), Double.parseDouble(v));
+	}
+
+	@Override
+	public double get(int r, int c) {
+		if(_data[r] == null)
+			return 0.0;
+		else
+			return _data[r][c];
+	}
+
+	@Override
+	public double get(int[] ix) {
+		return get(ix[0], pos(ix));
+	}
+
+	@Override
+	public String getString(int[] ix) {
+		return String.valueOf(get(ix[0], pos(ix)));
+	}
+
+	@Override
+	public long getLong(int[] ix) {
+		return UtilFunctions.toLong(get(ix[0], pos(ix)));
+	}
+
+	public double estimateMemory(){
+		if( (double)_rlen * this._odims[0] > Long.MAX_VALUE )
+			return Long.MAX_VALUE;
+		return DenseBlock.estimateMemory(_rlen, _odims[0])
+			+ MemoryEstimates.doubleArrayCost(_odims[0])*_distinct + MemoryEstimates.objectArrayCost(_rlen);
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java
index 2378f73169c..216aa533c56 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java
@@ -22,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@@ -34,6 +35,7 @@
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.DenseBlockFP64DEDUP;
 import org.apache.sysds.runtime.data.DenseBlockFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.data.SparseBlockCSR;
@@ -1823,12 +1825,28 @@ else if( ixFn instanceof ReduceRow ) //COLVAR
 	 * @param ru row upper index
 	 */
 	private static void d_uakp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru ) {
-		final int bil = a.index(rl);
-		final int biu = a.index(ru-1);
-		for(int bi=bil; bi<=biu; bi++) {
-			int lpos = (bi==bil) ? a.pos(rl) : 0;
-			int len = (bi==biu) ? a.pos(ru-1)-lpos+n : a.blockSize(bi)*n;
-			sum(a.valuesAt(bi), lpos, len, kbuff, kplus);
+		if(a instanceof DenseBlockFP64DEDUP){
+			HashMap<double[], Integer> counts = new HashMap<>();
+			for( int i=rl; i<ru; i++ )
+				counts.merge(a.values(i), 1, Integer::sum);
+			counts.forEach((row, count) -> {
+				for (int i = 0; i < row.length; i++) {
+					kplus.execute3(kbuff, row[i], count);
+				}
+			});
+		}
+		else {
+			final int bil = a.index(rl);
+			final int biu = a.index(ru - 1);
+			for (int bi = bil; bi <= biu; bi++) {
+				int lpos = (bi == bil) ? a.pos(rl) : 0;
+				int len = (bi == biu) ? a.pos(ru - 1) - lpos + n : a.blockSize(bi) * n;
+				sum(a.valuesAt(bi), lpos, len, kbuff, kplus);
+			}
 		}
 		c.set(kbuff);
 	}
@@ -1846,10 +1864,27 @@ private static void d_uakp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff
 	 */
 	private static void d_uarkp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru ) {
-		for( int i=rl; i<ru; i++ ) {
-			kbuff.set(0, 0); //reset buffer
-			sum( a.values(i), a.pos(i), n, kbuff, kplus );
-			c.set(i, kbuff);
-		}
+		if(a instanceof DenseBlockFP64DEDUP){
+			HashMap<double[], double[]> cache = new HashMap<>();
+			for( int i=rl; i<ru; i++ ) {
+				double[] row = a.values(i);
+				final int finalI = i;
+				double[] kbuff_array = cache.computeIfAbsent(row, lambda_row -> {
+					kbuff.set(0, 0);
+					sum(lambda_row , a.pos(finalI), n, kbuff, kplus );
+					return new double[] {kbuff._sum, kbuff._correction};
+				});
+				cache.putIfAbsent(row, kbuff_array);
+				c.set(i, 0, kbuff_array[0]);
+				c.set(i, 1, kbuff_array[1]);
+			}
+		}
+		else {
+			for (int i = rl; i < ru; i++) {
+				kbuff.set(0, 0); //reset buffer
+				sum(a.values(i), a.pos(i), n, kbuff, kplus);
+				c.set(i, kbuff);
+			}
 		}
 	}
@@ -1865,8 +1900,43 @@ private static void d_uarkp( DenseBlock a, DenseBlock c, int n, KahanObject kbuf
 	 * @param ru row upper index
 	 */
 	private static void d_uackp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru ) {
-		for( int i=rl; i<ru; i++ )
-			sumAgg( a.values(i), c, a.pos(i), n, kbuff, kplus );
+		if(a instanceof DenseBlockFP64DEDUP){
+			HashMap<double[], Integer> counts = new HashMap<>();
+			for( int i=rl; i<ru; i++ )
+				counts.merge(a.values(i), 1, Integer::sum);
+			double[] sum = new double[n];
+			double[] corr = new double[n];
+			counts.forEach((row, count) -> {
+				for (int i = 0; i < row.length; i++) {
+					kbuff._sum = sum[i];
+					kbuff._correction = corr[i];
+					kplus.execute3(kbuff, row[i], count);
+					sum[i] = kbuff._sum;
+					corr[i] = kbuff._correction;
+				}
+			});
+			double[] out_sum = c.values(0);
+			double[] out_corr = c.values(1);
+			int pos0 = c.pos(0), pos1 = c.pos(1);
+			for (int i = 0; i < n; i++) {
+				double tmp_sum = out_sum[pos0 + i] + sum[i];
+				if(Math.abs(out_sum[pos0 + i]) > Math.abs(sum[i])){
+					out_corr[pos1 + i] = ((out_sum[pos0 + i] - tmp_sum) + sum[i]) + out_corr[pos1 + i] + corr[i];
+				}
+				else {
+					out_corr[pos1 + i] = ((sum[i] - tmp_sum) + out_sum[pos0 + i]) + out_corr[pos1 + i] + corr[i];
+				}
+				out_sum[pos0 + i] = tmp_sum + out_corr[pos1 + i];
+			}
+		}
+		else{
+			for( int i=rl; i<ru; i++ )
+				sumAgg( a.values(i), c, a.pos(i), n, kbuff, kplus );
+		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
 			ArrayList<MatrixMultTask> tasks = new ArrayList<>();
-			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c));
+			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c || ret.denseBlock instanceof DenseBlockFP64DEDUP));
+			ConcurrentHashMap<double[], double[]> cache = m1.denseBlock instanceof DenseBlockFP64DEDUP ? new ConcurrentHashMap<>() : null;
 			for(int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
-				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i)));
+				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i), cache));
 			// execute tasks
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -1129,21 +1135,44 @@ private static void matrixMultDenseDenseMMSkinnyRHS(DenseBlock a, DenseBlock b,
 				cvals[cix+j] = dotProduct(avals, b.values(j), aix, b.pos(j), cd);
 		}
 	}
-	
+
+	public static void matrixMultDenseDenseMMDedup(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, ConcurrentHashMap<double[], double[]> cache) {
+		//n = m2.clen;
+		//cd = m1.clen;
+		for (int i = rl; i < ru; i++) {
+			double[] a_row = a.values(i);
+			double[] c_row = cache.getOrDefault(a_row, null);
+			if (c_row == null) {
+				c_row = new double[n];
+				for (int j = 0; j < n; j++) {
+					c_row[j] = 0.0;
+					//the following requires b.isContiguous(0,cd)
+					double[] b_column = b.values(0);
+					for (int k = 0; k < cd; k++) {
+						c_row[j] += a_row[k] * b_column[b.pos(k, j)];
+					}
+				}
+				//the following requires the rows of a to be deduplicated (shared references)
+				cache.put(a_row, c_row);
+			}
+			c.set(i, c_row);
+		}
+	}
+
 	//note: public for use by codegen for consistency
 	public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
 		//1) Unrolled inner loop (for better instruction-level parallelism)
-		//2) Blocked execution (for less cache trashing in parallel exec) 
+		//2) Blocked execution (for less cache trashing in parallel exec)
 		//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-		
-		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
-		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
-		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
+
+		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
+		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
+		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
 		//temporary arrays (nnz a, b index)
 		double[] ta = new double[ blocksizeK ];
 		int[] tbi = new int[ blocksizeK ];
-		
+
 		//blocked execution
 		for( int bi = rl; bi < ru; bi+=blocksizeI )
 			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
@@ -4135,9 +4164,10 @@ private static class MatrixMultTask implements Callable<Object>
 		private final boolean _sparse; //sparse output
 		private final int _rl;
 		private final int _ru;
+		private final ConcurrentHashMap<double[], double[]> _cache;
 
 		protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
-			boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru )
+			boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru, ConcurrentHashMap<double[], double[]> cache )
 		{
 			_m1 = m1;
 			_m2 = m2;
@@ -4148,7 +4178,8 @@ protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
 			_sparse = sparse;
 			_rl = rl;
 			_ru = ru;
-			
+			_cache = cache;
+
 			if( pm2r ) { //vector-matrix / matrix-matrix
 				//allocate local result for partial aggregation
 				_ret = new MatrixBlock(ret.rlen, ret.clen, false);
 			}
@@ -4174,7 +4205,11 @@ public Object call() {
 			if( _ret.sparse ) //ultra-sparse
 				matrixMultUltraSparse(_m1, _m2, _ret, _m1Perm, rl, ru);
 			else if(!_m1.sparse && !_m2.sparse)
-
matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu); + if(_m1.denseBlock instanceof DenseBlockFP64DEDUP && _m2.denseBlock.isContiguous(0,_m1.clen) && cl == 0 && cu == _m2.clen) + matrixMultDenseDenseMMDedup(_m1.denseBlock, _m2.denseBlock, _ret.denseBlock, _m2.clen, _m1.clen, rl, ru, _cache); + else + matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu); + else if(_m1.sparse && _m2.sparse) matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, _sparse, rl, ru); else if(_m1.sparse) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java index ebf4a6aeb3b..d0272548f45 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -42,6 +43,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.commons.math3.random.Well1024a; import org.apache.hadoop.io.DataInputBuffer; +import org.apache.sysds.common.Types; import org.apache.sysds.common.Types.BlockType; import org.apache.sysds.common.Types.CorrectionLocationType; import org.apache.sysds.conf.ConfigurationManager; @@ -56,6 +58,7 @@ import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.DenseBlockFP64; +import org.apache.sysds.runtime.data.DenseBlockFP64DEDUP; import org.apache.sysds.runtime.data.DenseBlockFactory; import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.data.SparseBlockCOO; @@ -172,7 +175,11 @@ public MatrixBlock(int rl, int cl, long estnnz) { public MatrixBlock(int rl, int cl, boolean sp, long estnnz) { reset(rl, cl, sp, estnnz, 0); } - + + public MatrixBlock(int rl, int cl, boolean sp, long estnnz, boolean dedup) { + reset(rl, cl, sp, estnnz, 0, dedup); + } + public MatrixBlock(MatrixBlock that) { copy(that); } @@ -298,7 +305,27 @@ public void reset(int rl, int cl, boolean sp, long estnnz, double val) { else resetDense(val); } - + + public void reset(int rl, int cl, boolean sp, long estnnz, double val, boolean dedup) { + //check for valid dimensions + if( rl < 0 || cl < 0 ) + throw new RuntimeException("Invalid block dimensions: "+rl+" "+cl); + + //reset basic meta data + rlen = rl; + clen = cl; + sparse = (val == 0) ? sp : false; + nonZeros = (val == 0) ? 0 : (long)rl*cl; + estimatedNNzsPerRow = (estnnz < 0 || !sparse) ? -1 : + (int)Math.ceil((double)estnnz/(double)rlen); + + //reset sparse/dense blocks + if( sparse ) + resetSparse(); + else + resetDense(val, dedup); + } + private void resetSparse() { if(sparseBlock == null) return; @@ -315,7 +342,18 @@ else if( val != 0 ) { denseBlock.set(val); } } - + + private void resetDense(double val, boolean dedup) { + //handle to dense block allocation and + //reset dense block to given value + if( denseBlock != null ) + denseBlock.reset(rlen, clen, val); + else if( val != 0 ) { + allocateDenseBlock(false, dedup); + denseBlock.set(val); + } + } + /** * NOTE: This method is designed only for dense representation. 
* @@ -401,6 +439,10 @@ public boolean allocateDenseBlock(boolean clearNNZ, boolean containsDuplicates) denseBlock = DenseBlockFactory.createDenseBlock(rlen, clen, containsDuplicates); return true; } + else if( containsDuplicates && !(denseBlock instanceof DenseBlockFP64DEDUP)) { + denseBlock = DenseBlockFactory.createDenseBlock(rlen, clen, true); + return true; + } else if( denseBlock.capacity() < limit ){ denseBlock.reset(rlen, clen); return true; @@ -2016,6 +2058,11 @@ public void readFields(DataInput in) cleanupBlock(false, true); //reuse dense readDenseBlock(in); //always dense in-mem if dense on disk break; + case DEDUP_BLOCK: + sparse = false; + cleanupBlock(false, true); //reuse dense + readDedupDenseBlock(in); //always dense in-mem if dense on disk + break; case EMPTY_BLOCK: sparse = true; cleanupBlock(true, !(sparseBlock instanceof SparseBlockCSR)); @@ -2031,6 +2078,33 @@ public void readFields(DataInput in) } } + private void readDedupDenseBlock(DataInput in) throws IOException, DMLRuntimeException { + allocateDenseBlock(true,true); + DenseBlock a = getDenseBlock(); + if(a.getDim(0) != rlen || a.getDim(1) != clen) + a.resetNoFill(rlen, clen); // reset the dimensions of a if incorrect. + HashMap mapping = new HashMap<>(); + for( int i=0; i a.numBlocks()) + throw new DMLRuntimeException("Serialize DedupDenseblock: block does not contain enough rows ["+a.numBlocks() +" < " + rlen + "]"); + + HashMap mapping = new HashMap<>((int) (a.getNrDistinctRows()*1.1)); + ArrayList unique_rows = new ArrayList<>((int) (a.getNrDistinctRows()*1.1)); + + for(int i=0; i seen = new HashMap<>(); + for (int i = 0; i < rows*10; i++) { + int row = Random.randInt(rows); + Integer tmpPos = seen.get(X[row]); + if(tmpPos == null) { + tmpPos = seen.size(); + seen.put(X[row], tmpPos); + } + X_duplicated[i] = X[row]; + mb.quickSetRow(i, X[row]); + } + + String fname = SCRIPT_DIR + TEST_DIR + "dedupSerializedBlock.out"; + LocalFileUtils.writeCacheBlockToLocal(fname, mb); + MatrixBlock mb2 = (MatrixBlock) LocalFileUtils.readCacheBlockFromLocal(fname, true); + + //compare matrices - values + for( int i=0; i seen2 = new HashMap<>(); + for( int i=0; i strings = generateRandomStrings(rows, 10); - - // Generate the dictionary by assigning unique ID to each distinct token - writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); - - // Create the dataset by repeating and shuffling the distinct tokens - List stringsColumn = shuffleAndMultiplyStrings(strings, 320); - writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); - - //run script - programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - } - catch(Exception ex) { - throw new RuntimeException(ex); - - } - finally { - resetExecMode(rtold); - } - } - - private void runTransformTest(String testname, ExecMode rt) - { - //set runtime platform - ExecMode rtold = setExecMode(rt); - try - { - int rows = 100; - int cols = 300; - getAndLoadTestConfiguration(testname); - fullDMLScriptName = getScript(); - - // Generate random embeddings for the distinct tokens - double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); - - // Generate random distinct tokens - List strings = generateRandomStrings(rows, 10); - - // Generate the dictionary by assigning unique ID to each distinct token - Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); - - // Create the dataset by repeating and 
shuffling the distinct tokens - List stringsColumn = shuffleAndMultiplyStrings(strings, 32); - writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); - - //run script - programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - - // Manually derive the expected result - double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); - - // Compare results - HashMap res_actual = readDMLMatrixFromOutputDir("result"); - double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); - TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); - } - catch(Exception ex) { - throw new RuntimeException(ex); - - } - finally { - resetExecMode(rtold); - } - } - - @SuppressWarnings("unused") - private void print2DimDoubleArray(double[][] resultActualDouble) { - Arrays.stream(resultActualDouble).forEach( - e -> System.out.println(Arrays.stream(e).mapToObj(d -> String.format("%06.1f", d)) - .reduce("", (sub, elem) -> sub + " " + elem))); - } - - private void runTransformTestMultiCols(String testname, ExecMode rt) - { - //set runtime platform - ExecMode rtold = setExecMode(rt); - try - { - int rows = 100; - int cols = 100; - getAndLoadTestConfiguration(testname); - fullDMLScriptName = getScript(); - - // Generate random embeddings for the distinct tokens - double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); - - // Generate random distinct tokens - List strings = generateRandomStrings(rows, 10); - - // Generate the dictionary by assigning unique ID to each distinct token - Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); - - // Create the dataset by repeating and shuffling the distinct tokens - List stringsColumn = shuffleAndMultiplyStrings(strings, 10); - writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); - - //run script - programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result"), output("result2")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - - // Manually derive the expected result - double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); - - // Compare results - HashMap res_actual = readDMLMatrixFromOutputDir("result"); - HashMap res_actual2 = readDMLMatrixFromOutputDir("result2"); - double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); - double[][] resultActualDouble2 = TestUtils.convertHashMapToDoubleArray(res_actual2); - //System.out.println("Actual Result1 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); - ///print2DimDoubleArray(resultActualDouble); - //System.out.println("\nActual Result2 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); - //print2DimDoubleArray(resultActualDouble2); - //System.out.println("\nExpected Result [" + res_expected.length + "x" + res_expected[0].length + "]:"); - //print2DimDoubleArray(res_expected); - TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); - TestUtils.compareMatrices(resultActualDouble, resultActualDouble2, 1e-6); - } - catch(Exception ex) { - throw new RuntimeException(ex); - - } - finally { - resetExecMode(rtold); - } - } - - private double[][] manuallyDeriveWordEmbeddings(int cols, double[][] a, Map map, List stringsColumn) { - // Manually derive the expected result - double[][] res_expected = new 
double[stringsColumn.size()][cols]; - for (int i = 0; i < stringsColumn.size(); i++) { - int rowMapped = map.get(stringsColumn.get(i)); - System.arraycopy(a[rowMapped], 0, res_expected[i], 0, cols); - } - return res_expected; - } - - @SuppressWarnings("unused") - private double[][] generateWordEmbeddings(int rows, int cols) { - double[][] a = new double[rows][cols]; - for (int i = 0; i < a.length; i++) { - for (int j = 0; j < a[i].length; j++) { - a[i][j] = cols *i + j; - } - - } - return a; - } - - public static List shuffleAndMultiplyStrings(List strings, int multiply){ - List out = new ArrayList<>(); - Random random = new Random(); - for (int i = 0; i < strings.size()*multiply; i++) { - out.add(strings.get(random.nextInt(strings.size()))); - } - return out; - } - - public static List generateRandomStrings(int numStrings, int stringLength) { - List randomStrings = new ArrayList<>(); - Random random = new Random(); - String characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; - for (int i = 0; i < numStrings; i++) { - randomStrings.add(generateRandomString(random, stringLength, characters)); - } - return randomStrings; - } - - public static String generateRandomString(Random random, int stringLength, String characters){ - StringBuilder randomString = new StringBuilder(); - for (int j = 0; j < stringLength; j++) { - int randomIndex = random.nextInt(characters.length()); - randomString.append(characters.charAt(randomIndex)); - } - return randomString.toString(); - } - - public static void writeStringsToCsvFile(List strings, String fileName) { - try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { - for (String line : strings) { - bw.write(line); - bw.newLine(); - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static Map writeDictToCsvFile(List strings, String fileName) { - try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { - Map map = new HashMap<>(); - for (int i = 0; i < strings.size(); i++) { - map.put(strings.get(i), i); - bw.write(strings.get(i) + Lop.DATATYPE_PREFIX + (i+1) + "\n"); - } - return map; - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } + private final static String TEST_NAME1 = "TransformFrameEncodeWordEmbeddings2"; + private final static String TEST_NAME2a = "TransformFrameEncodeWordEmbeddings2MultiCols1"; + private final static String TEST_NAME2b = "TransformFrameEncodeWordEmbeddings2MultiCols2"; + + private final static String TEST_DIR = "functions/transform/"; + private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeWordEmbedding1Test.class.getSimpleName() + "/"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_DIR, TEST_NAME1)); + addTestConfiguration(TEST_NAME2a, new TestConfiguration(TEST_DIR, TEST_NAME2a)); + addTestConfiguration(TEST_NAME2b, new TestConfiguration(TEST_DIR, TEST_NAME2b)); + } + + @Test + public void testTransformToWordEmbeddings() { + runTransformTest(TEST_NAME1, ExecMode.SINGLE_NODE); + } + + @Test + @Ignore + public void testNonRandomTransformToWordEmbeddings2Cols() { + runTransformTest(TEST_NAME2a, ExecMode.SINGLE_NODE); + } + + @Test + @Ignore + public void testRandomTransformToWordEmbeddings4Cols() { + runTransformTestMultiCols(TEST_NAME2b, ExecMode.SINGLE_NODE); + } + + @Test + @Ignore + public void runBenchmark(){ + runBenchmark(TEST_NAME1, ExecMode.SINGLE_NODE); + } + + + + + private void runBenchmark(String 
testname, ExecMode rt) + { + //set runtime platform + ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + private void runTransformTest(String testname, ExecMode rt) + { + //set runtime platform + ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + public static void print2DimDoubleArray(double[][] resultActualDouble) { + Arrays.stream(resultActualDouble).forEach( + e -> System.out.println(Arrays.stream(e).mapToObj(d -> String.format("%06.1f", d)) + .reduce("", (sub, elem) -> sub + " " + elem))); + } + + private void runTransformTestMultiCols(String testname, ExecMode rt) + { + //set runtime platform + ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 100; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + 
"dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 10); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result"), output("result2")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + HashMap res_actual2 = readDMLMatrixFromOutputDir("result2"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + double[][] resultActualDouble2 = TestUtils.convertHashMapToDoubleArray(res_actual2); + //System.out.println("Actual Result1 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); + print2DimDoubleArray(resultActualDouble); + //System.out.println("\nActual Result2 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); + //print2DimDoubleArray(resultActualDouble2); + //System.out.println("\nExpected Result [" + res_expected.length + "x" + res_expected[0].length + "]:"); + //print2DimDoubleArray(res_expected); + TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); + TestUtils.compareMatrices(resultActualDouble, resultActualDouble2, 1e-6); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + public static double[][] manuallyDeriveWordEmbeddings(int cols, double[][] a, Map map, List stringsColumn) { + // Manually derive the expected result + double[][] res_expected = new double[stringsColumn.size()][cols]; + for (int i = 0; i < stringsColumn.size(); i++) { + int rowMapped = map.get(stringsColumn.get(i)); + System.arraycopy(a[rowMapped], 0, res_expected[i], 0, cols); + } + return res_expected; + } + + private double[][] generateWordEmbeddings(int rows, int cols) { + double[][] a = new double[rows][cols]; + for (int i = 0; i < a.length; i++) { + for (int j = 0; j < a[i].length; j++) { + a[i][j] = cols *i + j; + } + + } + return a; + } + + public static List shuffleAndMultiplyStrings(List strings, int multiply){ + List out = new ArrayList<>(); + Random random = new Random(); + for (int i = 0; i < strings.size()*multiply; i++) { + out.add(strings.get(random.nextInt(strings.size()))); + } + return out; + } + + public static List generateRandomStrings(int numStrings, int stringLength) { + List randomStrings = new ArrayList<>(); + Random random = new Random(); + String characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + for (int i = 0; i < numStrings; i++) { + randomStrings.add(generateRandomString(random, stringLength, characters)); + } + return randomStrings; + } + + public static String generateRandomString(Random random, int stringLength, String characters){ + StringBuilder randomString = new StringBuilder(); + for (int j = 0; j < stringLength; j++) { + int randomIndex = random.nextInt(characters.length()); + randomString.append(characters.charAt(randomIndex)); + } + return randomString.toString(); + } + + public static void writeStringsToCsvFile(List strings, String fileName) { + try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { + for (String line : strings) { + bw.write(line); + bw.newLine(); + } + } catch (IOException e) { + e.printStackTrace(); + } + 
} + + public static Map writeDictToCsvFile(List strings, String fileName) { + try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { + Map map = new HashMap<>(); + for (int i = 0; i < strings.size(); i++) { + map.put(strings.get(i), i); + bw.write(strings.get(i) + Lop.DATATYPE_PREFIX + (i+1) + "\n"); + } + return map; + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } } diff --git a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingMMTest.java b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingMMTest.java new file mode 100644 index 00000000000..3862294ca6f --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingMMTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.test.functions.transform; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.sysds.test.functions.transform.TransformFrameEncodeWordEmbedding2Test.*; + +public class TransformFrameEncodeWordEmbeddingMMTest extends AutomatedTestBase { + private final static String TEST_NAME1 = "TransformFrameEncodeWordEmbeddingsMM"; + private final static String TEST_DIR = "functions/transform/"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_DIR, TEST_NAME1)); + } + + @Test + public void testMultiplication() { + runMatrixMultiplicationTest(TEST_NAME1, Types.ExecMode.SINGLE_NODE); + } + + private void runMatrixMultiplicationTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + double[][] b = createRandomMatrix("factor", cols, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + int factor = 320; + rows *= factor; + 
List stringsColumn = shuffleAndMultiplyStrings(strings, factor); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), input("factor"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] res_expectedMM = new double[rows][cols]; + for (int i = 0; i < res_expectedMM.length; i++) { + for (int j = 0; j < res_expectedMM[i].length; j++) { + res_expectedMM[i][j] = 0.0; + for (int k = 0; k < res_expected[i].length; k++) { + res_expectedMM[i][j] += res_expected[i][k]*b[k][j]; + } + } + } + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + TestUtils.compareMatrices(res_expectedMM, resultActualDouble, 1e-8); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingRowSumTest.java b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingRowSumTest.java new file mode 100644 index 00000000000..b3a09f3ec87 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingRowSumTest.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.transform; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.functionobjects.KahanPlus; +import org.apache.sysds.runtime.instructions.cp.KahanObject; +import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.sysds.runtime.functionobjects.KahanPlus.getKahanPlusFnObject; +import static org.apache.sysds.test.functions.transform.TransformFrameEncodeWordEmbedding2Test.*; + +public class TransformFrameEncodeWordEmbeddingRowSumTest extends AutomatedTestBase { + private final static String TEST_NAME1 = "TransformFrameEncodeWordEmbeddingsRowSum"; + private final static String TEST_NAME2 = "TransformFrameEncodeWordEmbeddingsColSum"; + private final static String TEST_NAME3 = "TransformFrameEncodeWordEmbeddingsFullSum"; + private final static String TEST_DIR = "functions/transform/"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_DIR, TEST_NAME1)); + addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_DIR, TEST_NAME2)); + addTestConfiguration(TEST_NAME3, new TestConfiguration(TEST_DIR, TEST_NAME3)); + } + + @Test + public void testDedupRowSums() { + runDedupRowSumTest(TEST_NAME1, Types.ExecMode.SINGLE_NODE); + } + + @Test + public void testDedupColSums() { + runDedupColSumTest(TEST_NAME2, Types.ExecMode.SINGLE_NODE); + } + + @Test + public void testDedupFullSums() { + runDedupFullSumTest(TEST_NAME3, Types.ExecMode.SINGLE_NODE); + } + + private void runDedupFullSumTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320*6); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] sums_expected = new double[1][1]; + KahanObject ko = new KahanObject(0,0); + KahanPlus kp = getKahanPlusFnObject(); + for (int i = 0; i < res_expected.length; i++) { + for (int j = 0; j < res_expected[i].length; j++) { + kp.execute2(ko, res_expected[i][j]); + } + } + sums_expected[0][0] = ko._sum; + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + 
TestUtils.compareMatrices(sums_expected, resultActualDouble, 1e-14); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + private void runDedupColSumTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320*6); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] sums_expected = new double[1][res_expected[0].length]; + KahanObject ko = new KahanObject(0,0); + KahanPlus kp = getKahanPlusFnObject(); + for (int i = 0; i < res_expected[0].length; i++) { + ko.set(0,0); + for (int j = 0; j < res_expected.length; j++) { + kp.execute2(ko, res_expected[j][i]); + } + sums_expected[0][i] = ko._sum; + } + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + TestUtils.compareMatrices(sums_expected, resultActualDouble, 1e-9); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + private void runDedupRowSumTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] sums_expected = new double[res_expected.length][1]; + KahanObject ko = new KahanObject(0,0); + KahanPlus kp = getKahanPlusFnObject(); + for (int i = 0; i < res_expected.length; i++) { + ko.set(0,0); + for (int j = 0; j < 
res_expected[i].length; j++) { + kp.execute2(ko, res_expected[i][j]); + } + sums_expected[i][0] = ko._sum; + } + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + TestUtils.compareMatrices(sums_expected, resultActualDouble, 1e-15); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } +} diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsColSum.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsColSum.dml new file mode 100644 index 00000000000..59e79631834 --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsColSum.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +MatrixRowSum = colSums(Data_enc); +write(MatrixRowSum, $4, format="text"); \ No newline at end of file diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsFullSum.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsFullSum.dml new file mode 100644 index 00000000000..5b5fb81303e --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsFullSum.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +MatrixRowSum = matrix(sum(Data_enc),1,1) +write(MatrixRowSum, $4, format="text"); \ No newline at end of file diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsMM.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsMM.dml new file mode 100644 index 00000000000..c439ef50d7b --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsMM.dml @@ -0,0 +1,38 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); +#Read the matrix that is used for multiplication after transform +MM = read($4, rows=300, cols=300, format="text"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +Product = Data_enc %*% MM +write(Product, $5, format="text"); + + + + diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsRowSum.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsRowSum.dml new file mode 100644 index 00000000000..2ce047ba39d --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsRowSum.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +MatrixRowSum = rowSums(Data_enc); +write(MatrixRowSum, $4, format="text"); \ No newline at end of file
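
Note on the core technique in this patch: DenseBlockFP64DEDUP stores one double[] per row and shares a single array instance across duplicate rows, so per-row results (nnz counts in countNonZeros, Kahan row sums in d_uarkp, output rows in matrixMultDenseDenseMMDedup) can be memoized in a HashMap keyed by the array reference. Java arrays do not override hashCode()/equals(), so such a map compares keys by identity, which is exactly what shared row references need. The following is a minimal, self-contained sketch of that caching pattern outside SystemDS; the class and variable names are illustrative only and not part of the patch.

import java.util.HashMap;

public class DedupRowSumSketch {
	public static void main(String[] args) {
		// two distinct embedding rows, shared by reference across ten logical rows
		double[] e0 = {1.0, 2.0, 3.0};
		double[] e1 = {4.0, 5.0, 6.0};
		double[][] data = new double[10][];
		for (int i = 0; i < data.length; i++)
			data[i] = (i % 2 == 0) ? e0 : e1;

		// memoize the per-row sum once per distinct array reference,
		// mirroring the cache pattern in countNonZeros/d_uarkp above
		HashMap<double[], Double> cache = new HashMap<>();
		double total = 0;
		for (double[] row : data) {
			Double s = cache.get(row); // identity-based lookup
			if (s == null) {
				s = 0.0;
				for (double v : row)
					s += v;
				cache.put(row, s);
			}
			total += s;
		}
		System.out.println("distinct rows: " + cache.size()); // 2
		System.out.println("total sum: " + total); // 5*6 + 5*15 = 105.0
	}
}

With this layout, work scales with the number of distinct rows rather than the number of logical rows, which is why the tests above repeat 100 distinct tokens hundreds of times: the aggregate and matrix-multiply kernels then touch each unique embedding row only once.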