diff --git a/src/main/java/org/apache/sysds/common/Types.java b/src/main/java/org/apache/sysds/common/Types.java index 78ed173bc9e..7bf48f5da5e 100644 --- a/src/main/java/org/apache/sysds/common/Types.java +++ b/src/main/java/org/apache/sysds/common/Types.java @@ -213,6 +213,7 @@ public enum BlockType{ ULTRA_SPARSE_BLOCK, SPARSE_BLOCK, DENSE_BLOCK, + DEDUP_BLOCK, } /** diff --git a/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java b/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java index b2273faa75b..e9428eb3714 100644 --- a/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java +++ b/src/main/java/org/apache/sysds/runtime/data/DenseBlockFP64DEDUP.java @@ -19,236 +19,281 @@ package org.apache.sysds.runtime.data; -import org.apache.commons.lang3.NotImplementedException; +import org.apache.commons.lang.NotImplementedException; import org.apache.sysds.common.Types; import org.apache.sysds.runtime.util.UtilFunctions; +import org.apache.sysds.utils.MemoryEstimates; import java.util.Arrays; import java.util.HashMap; public class DenseBlockFP64DEDUP extends DenseBlockDRB{ - private static final long serialVersionUID = 4124905190428752213L; - private double[][] _data; - - protected DenseBlockFP64DEDUP(int[] dims) { - super(dims); - reset(_rlen, _odims, 0); - } - - @Override - protected void allocateBlock(int bix, int length) { - _data[bix] = new double[length]; - } - - @Override - public void reset(int rlen, int[] odims, double v) { - if(rlen > capacity() / _odims[0]) - _data = new double[rlen][]; - else{ - if(v == 0.0){ - for(int i = 0; i < rlen; i++) - _data[i] = null; - } - else { - for(int i = 0; i < rlen; i++){ - if(odims[0] > _odims[0] ||_data[i] == null ) - allocateBlock(i, odims[0]); - Arrays.fill(_data[i], 0, odims[0], v); - } - } - } - _rlen = rlen; - _odims = odims; - } - - @Override - public void resetNoFill(int rlen, int[] odims) { - if(_data == null || rlen > _rlen){ - _data = new double[rlen][]; - } - _rlen = rlen; - _odims = odims; - } - - @Override - public boolean isNumeric() { - return true; - } - - @Override - public boolean isNumeric(Types.ValueType vt) { - return Types.ValueType.FP64 == vt; - } - - @Override - public long capacity() { - return (_data != null) ? _data.length*_odims[0] : -1; - } - - @Override - public long countNonZeros(){ - long nnz = 0; - HashMap cache = new HashMap<>(); - for (int i = 0; i < _rlen; i++) { - double[] row = this._data[i]; - if(row == null) - continue; - Long count = cache.getOrDefault(row, null); - if(count == null){ - count = Long.valueOf(countNonZeros(i)); - cache.put(row, count); - } - nnz += count; - } - return nnz; - } - - @Override - public int countNonZeros(int r) { - return _data[r] == null ? 0 : UtilFunctions.computeNnz(_data[r], 0, _odims[0]); - } - - @Override - protected long computeNnz(int bix, int start, int length) { - int nnz = 0; - int row_start = (int) Math.floor(start / _odims[0]); - int col_start = start % _odims[0]; - for (int i = 0; i < length; i++) { - if(_data[row_start] == null){ - i += _odims[0] - 1 - col_start; - col_start = 0; - row_start += 1; - continue; - } - nnz += _data[row_start][col_start] != 0 ? 
1 : 0; - col_start += 1; - if(col_start == _odims[0]) { - col_start = 0; - row_start += 1; - } - } - return nnz; - } - - @Override - public int pos(int r){ - return 0; - } - - @Override - public int pos(int[] ix){ - int pos = ix[ix.length - 1]; - for(int i = 1; i < ix.length - 1; i++) - pos += ix[i] * _odims[i]; - return pos; - } - - @Override - public double[] values(int r) { - return valuesAt(r); - } - - @Override - public double[] valuesAt(int bix) { - return _data[bix] == null ? new double[_odims[0]] : _data[bix]; - } - - @Override - public int index(int r) { - return r; - } - - @Override - public int numBlocks(){ - return _data.length; - } - - @Override - public int size(int bix) { - return _odims[0]; - } - - @Override - public void incr(int r, int c) { - incr(r,c,1.0); - } - - @Override - public void incr(int r, int c, double delta) { - if(_data[r] == null) - allocateBlock(r, _odims[0]); - _data[r][c] += delta; - } - - @Override - protected void fillBlock(int bix, int fromIndex, int toIndex, double v) { - if(_data[bix] == null) - allocateBlock(bix, _odims[0]); - Arrays.fill(_data[bix], fromIndex, toIndex, v); - } - - @Override - protected void setInternal(int bix, int ix, double v) { - set(bix, ix, v); - } - - @Override - public DenseBlock set(int r, int c, double v) { - if(_data[r] == null) - _data[r] = new double[_odims[0]]; - _data[r][c] = v; - return this; - } - - @Override - public DenseBlock set(int r, double[] v) { - if(v.length == _odims[0]) - _data[r] = v; - else - throw new RuntimeException("set Denseblock called with an array length [" + v.length +"], array to overwrite is of length [" + _odims[0] + "]"); - return this; - } - - @Override - public DenseBlock set(DenseBlock db) { - throw new NotImplementedException(); - } - - @Override - public DenseBlock set(int[] ix, double v) { - return set(ix[0], pos(ix), v); - } - - @Override - public DenseBlock set(int[] ix, long v) { - return set(ix[0], pos(ix), v); - } - - @Override - public DenseBlock set(int[] ix, String v) { - return set(ix[0], pos(ix), Double.parseDouble(v)); - } - - @Override - public double get(int r, int c) { - if(_data[r] == null) - return 0.0; - else - return _data[r][c]; - } - - @Override - public double get(int[] ix) { - return get(ix[0], pos(ix)); - } - - @Override - public String getString(int[] ix) { - return String.valueOf(get(ix[0], pos(ix))); - } - - @Override - public long getLong(int[] ix) { - return UtilFunctions.toLong(get(ix[0], pos(ix))); - } + private double[][] _data; + private int _distinct = 0; + + protected DenseBlockFP64DEDUP(int[] dims) { + super(dims); + reset(_rlen, _odims, 0); + } + + public int getNrDistinctRows(){ + return _distinct; + } + + @Override + protected void allocateBlock(int bix, int length) { + _data[bix] = new double[length]; + } + + @Override + public void reset(int rlen, int[] odims, double v) { + if(rlen > capacity() / _odims[0]) + _data = new double[rlen][]; + else{ + if(v == 0.0){ + for(int i = 0; i < rlen; i++) + _data[i] = null; + } + else { + for(int i = 0; i < rlen; i++){ + if(odims[0] > _odims[0] ||_data[i] == null ) + allocateBlock(i, odims[0]); + Arrays.fill(_data[i], 0, odims[0], v); + } + } + } + _rlen = rlen; + _odims = odims; + } + + @Override + public void resetNoFill(int rlen, int[] odims) { + if(_data == null || rlen > _rlen){ + _data = new double[rlen][]; + } + _rlen = rlen; + _odims = odims; + } + + @Override + public boolean isNumeric() { + return true; + } + + @Override + public boolean isNumeric(Types.ValueType vt) { + return Types.ValueType.FP64 
== vt;
+	}
+
+	@Override
+	public long capacity() {
+		return (_data != null) ? ((long) _data.length)*_odims[0] : -1;
+	}
+
+	@Override
+	public long countNonZeros(){
+		long nnz = 0;
+		HashMap<double[], Long> cache = new HashMap<>();
+		for (int i = 0; i < _rlen; i++) {
+			double[] row = this._data[i];
+			if(row == null)
+				continue;
+			Long count = cache.getOrDefault(row, null);
+			if(count == null){
+				count = Long.valueOf(countNonZeros(i));
+				cache.put(row, count);
+			}
+			nnz += count;
+		}
+		this._distinct = cache.size();
+		return nnz;
+	}
+
+	@Override
+	public int countNonZeros(int r) {
+		return _data[r] == null ? 0 : UtilFunctions.computeNnz(_data[r], 0, _odims[0]);
+	}
+
+	@Override
+	public long countNonZeros(int rl, int ru, int ol, int ou) {
+		long nnz = 0;
+		HashMap<double[], Long> cache = new HashMap<>();
+		for (int i = rl; i < ru; i++) {
+			double[] row = this._data[i];
+			if(row == null)
+				continue;
+			Long count = cache.getOrDefault(row, null);
+			if(count == null){
+				count = Long.valueOf(UtilFunctions.computeNnz(_data[i], ol, ou));
+				cache.put(row, count);
+			}
+			nnz += count;
+		}
+		return nnz;
+	}
+
+	@Override
+	protected long computeNnz(int bix, int start, int length) {
+		int nnz = 0;
+		int row_start = (int) Math.floor(start / _odims[0]);
+		int col_start = start % _odims[0];
+		for (int i = 0; i < length; i++) {
+			if(_data[row_start] == null){
+				i += _odims[0] - 1 - col_start;
+				col_start = 0;
+				row_start += 1;
+				continue;
+			}
+			nnz += _data[row_start][col_start] != 0 ? 1 : 0;
+			col_start += 1;
+			if(col_start == _odims[0]) {
+				col_start = 0;
+				row_start += 1;
+			}
+		}
+		return nnz;
+	}
+
+	@Override
+	public int pos(int r){
+		return 0;
+	}
+
+	@Override
+	public int pos(int r, int c){
+		return c;
+	}
+
+	@Override
+	public int pos(int[] ix){
+		int pos = ix[ix.length - 1];
+		for(int i = 1; i < ix.length - 1; i++)
+			pos += ix[i] * _odims[i];
+		return pos;
+	}
+
+	@Override
+	public int blockSize(int bix) {
+		return 1;
+	}
+
+	public boolean isContiguous(int rl, int ru){
+		return rl == ru;
+	}
+
+	@Override
+	public double[] values(int r) {
+		return valuesAt(r);
+	}
+
+	@Override
+	public double[] valuesAt(int bix) {
+		return _data[bix] == null ? new double[_odims[0]] : _data[bix];
+	}
+
+	@Override
+	public int index(int r) {
+		return r;
+	}
+
+	@Override
+	public int numBlocks(){
+		return _data.length;
+	}
+
+	@Override
+	public int size(int bix) {
+		return _odims[0];
+	}
+
+	@Override
+	public void incr(int r, int c) {
+		incr(r,c,1.0);
+	}
+
+	@Override
+	public void incr(int r, int c, double delta) {
+		if(_data[r] == null)
+			allocateBlock(r, _odims[0]);
+		_data[r][c] += delta;
+	}
+
+	@Override
+	protected void fillBlock(int bix, int fromIndex, int toIndex, double v) {
+		if(_data[bix] == null)
+			allocateBlock(bix, _odims[0]);
+		Arrays.fill(_data[bix], fromIndex, toIndex, v);
+	}
+
+	@Override
+	protected void setInternal(int bix, int ix, double v) {
+		set(bix, ix, v);
+	}
+
+	@Override
+	public DenseBlock set(int r, int c, double v) {
+		if(_data[r] == null)
+			_data[r] = new double[_odims[0]];
+		_data[r][c] = v;
+		return this;
+	}
+
+	@Override
+	public DenseBlock set(int r, double[] v) {
+		if(v.length == _odims[0])
+			_data[r] = v;
+		else
+			throw new RuntimeException("set DenseBlock called with an array of length [" + v.length + "], but the row to overwrite has length [" + _odims[0] + "]");
+		return this;
+	}
+
+	@Override
+	public DenseBlock set(DenseBlock db) {
+		throw new NotImplementedException();
+	}
+
+	@Override
+	public DenseBlock set(int[] ix, double v) {
+		return set(ix[0], pos(ix), v);
+	}
+
+	@Override
+	public DenseBlock set(int[] ix, long v) {
+		return set(ix[0], pos(ix), v);
+	}
+
+	@Override
+	public DenseBlock set(int[] ix, String v) {
+		return set(ix[0], pos(ix), Double.parseDouble(v));
+	}
+
+	@Override
+	public double get(int r, int c) {
+		if(_data[r] == null)
+			return 0.0;
+		else
+			return _data[r][c];
+	}
+
+	@Override
+	public double get(int[] ix) {
+		return get(ix[0], pos(ix));
+	}
+
+	@Override
+	public String getString(int[] ix) {
+		return String.valueOf(get(ix[0], pos(ix)));
+	}
+
+	@Override
+	public long getLong(int[] ix) {
+		return UtilFunctions.toLong(get(ix[0], pos(ix)));
+	}
+
+	public double estimateMemory(){
+		if( (double)_rlen * this._odims[0] > Long.MAX_VALUE )
+			return Long.MAX_VALUE;
+		return DenseBlock.estimateMemory(_rlen, _odims[0])
+			+ MemoryEstimates.doubleArrayCost(_odims[0])*_distinct + MemoryEstimates.objectArrayCost(_rlen);
+	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java
index 2378f73169c..216aa533c56 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixAgg.java
@@ -22,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@@ -34,6 +35,7 @@
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.DenseBlockFP64DEDUP;
 import org.apache.sysds.runtime.data.DenseBlockFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.data.SparseBlockCSR;
@@ -1823,12 +1825,28 @@ else if( ixFn instanceof ReduceRow ) //COLVAR
 	 * @param ru row upper index
 	 */
 	private static void d_uakp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru ) {
-		final int bil = a.index(rl);
-		final int biu = a.index(ru-1);
-		for(int bi=bil; bi<=biu; bi++) {
-			int lpos = (bi==bil) ? a.pos(rl) : 0;
-			int len = (bi==biu) ? a.pos(ru-1)-lpos+n : a.blockSize(bi)*n;
-			sum(a.valuesAt(bi), lpos, len, kbuff, kplus);
+		if(a instanceof DenseBlockFP64DEDUP){
+			HashMap<double[], Integer> counts = new HashMap<>();
+			for( int i=rl; i<ru; i++ )
+				counts.merge(a.values(i), 1, Integer::sum);
+			counts.forEach((row, count) -> {
+				for (int i = 0; i < row.length; i++) {
+					kplus.execute3(kbuff, row[i], count);
+				}
+			});
+		}
+		else {
+			final int bil = a.index(rl);
+			final int biu = a.index(ru - 1);
+			for (int bi = bil; bi <= biu; bi++) {
+				int lpos = (bi == bil) ? a.pos(rl) : 0;
+				int len = (bi == biu) ? a.pos(ru - 1) - lpos + n : a.blockSize(bi) * n;
+				sum(a.valuesAt(bi), lpos, len, kbuff, kplus);
+			}
 		}
 		c.set(kbuff);
 	}
@@ -1846,10 +1864,27 @@ private static void d_uakp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff
 	 */
 	private static void d_uarkp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru ) {
-		for( int i=rl; i<ru; i++ ) {
-			kbuff.set(0, 0); //reset buffer
-			sum( a.values(i), a.pos(i), n, kbuff, kplus );
-			c.set(i, kbuff);
-		}
+		if(a instanceof DenseBlockFP64DEDUP){
+			HashMap<double[], double[]> cache = new HashMap<>();
+			for( int i=rl; i<ru; i++ ) {
+				double[] row = a.values(i);
+				final int finalI = i;
+				double[] kbuff_array = cache.computeIfAbsent(row, lambda_row -> {
+					kbuff.set(0, 0);
+					sum(lambda_row , a.pos(finalI), n, kbuff, kplus );
+					return new double[] {kbuff._sum, kbuff._correction};
+				});
+				cache.putIfAbsent(row, kbuff_array);
+				c.set(i, 0, kbuff_array[0]);
+				c.set(i, 1, kbuff_array[1]);
+			}
+		}
+		else {
+			for (int i = rl; i < ru; i++) {
+				kbuff.set(0, 0); //reset buffer
+				sum(a.values(i), a.pos(i), n, kbuff, kplus);
+				c.set(i, kbuff);
+			}
 		}
 	}
@@ -1865,8 +1900,43 @@ private static void d_uarkp( DenseBlock a, DenseBlock c, int n, KahanObject kbuf
 	 * @param ru row upper index
 	 */
 	private static void d_uackp( DenseBlock a, DenseBlock c, int n, KahanObject kbuff, KahanPlus kplus, int rl, int ru ) {
-		for( int i=rl; i<ru; i++ )
-			sumAgg( a.values(i), c, a.pos(i), n, kbuff, kplus );
+		if(a instanceof DenseBlockFP64DEDUP){
+			HashMap<double[], Integer> counts = new HashMap<>();
+			for( int i=rl; i<ru; i++ )
+				counts.merge(a.values(i), 1, Integer::sum);
+			double[] sum = new double[n];
+			double[] corr = new double[n];
+			counts.forEach((row, count) -> {
+				for (int i = 0; i < row.length; i++) {
+					kbuff._sum = sum[i];
+					kbuff._correction = corr[i];
+					kplus.execute3(kbuff, row[i], count);
+					sum[i] = kbuff._sum;
+					corr[i] = kbuff._correction;
+				}
+			});
+			double[] out_sum = c.values(0);
+			double[] out_corr = c.values(1);
+			int pos0 = c.pos(0), pos1 = c.pos(1);
+			for (int i = 0; i < n; i++) {
+				double tmp_sum = out_sum[pos0 + i] + sum[i];
+				if(Math.abs(out_sum[pos0 + i]) > Math.abs(sum[i])){
+					out_corr[pos1 + i] = ((out_sum[pos0 + i] - tmp_sum) + sum[i]) + out_corr[pos1 + i] + corr[i];
+				}
+				else {
+					out_corr[pos1 + i] = ((sum[i] - tmp_sum) + out_sum[pos0 + i]) + out_corr[pos1 + i] + corr[i];
+				}
+				out_sum[pos0 + i] = tmp_sum + out_corr[pos1 + i];
+			}
+		}
+		else{
+			for( int i=rl; i<ru; i++ )
+				sumAgg( a.values(i), c, a.pos(i), n, kbuff, kplus );
+		}
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
 			ArrayList<MatrixMultTask> tasks = new ArrayList<>();
-			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c));
+			ArrayList<Integer> blklens = UtilFunctions.getBalancedBlockSizesDefault(num, k, (pm2r || pm2c || ret.denseBlock instanceof DenseBlockFP64DEDUP));
+			ConcurrentHashMap<double[], double[]> cache = m1.denseBlock instanceof DenseBlockFP64DEDUP ? new ConcurrentHashMap<>() : null;
 			for(int i = 0, lb = 0; i < blklens.size(); lb += blklens.get(i), i++)
-				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i)));
+				tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2r, pm2c, m1Perm, sparse, lb, lb + blklens.get(i), cache));
 			// execute tasks
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -1129,21 +1135,44 @@ private static void matrixMultDenseDenseMMSkinnyRHS(DenseBlock a, DenseBlock b,
 				cvals[cix+j] = dotProduct(avals, b.values(j), aix, b.pos(j), cd);
 		}
 	}
-	
+
+	public static void matrixMultDenseDenseMMDedup(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, ConcurrentHashMap<double[], double[]> cache) {
+		//n = m2.clen;
+		//cd = m1.clen;
+		for (int i = rl; i < ru; i++) {
+			double[] a_row = a.values(i);
+			double[] c_row = cache.getOrDefault(a_row, null);
+			if (c_row == null) {
+				c_row = new double[n];
+				for (int j = 0; j < n; j++) {
+					c_row[j] = 0.0;
+					//the following requires b.isContiguous(0,cd)
+					double[] b_column = b.values(0);
+					for (int k = 0; k < cd; k++) {
+						c_row[j] += a_row[k] * b_column[b.pos(k, j)];
+					}
+				}
+				//the following requires the rows of a to be deduplicated (shared references)
+				cache.put(a_row, c_row);
+			}
+			c.set(i, c_row);
+		}
+	}
+
 	//note: public for use by codegen for consistency
 	public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
 		//1) Unrolled inner loop (for better instruction-level parallelism)
-		//2) Blocked execution (for less cache trashing in parallel exec) 
+		//2) Blocked execution (for less cache trashing in parallel exec)
 		//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
-		
-		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
-		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
-		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
+
+		final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
+		final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
+		final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
 		//temporary arrays (nnz a, b index)
 		double[] ta = new double[ blocksizeK ];
 		int[] tbi = new int[ blocksizeK ];
-		
+
 		//blocked execution
 		for( int bi = rl; bi < ru; bi+=blocksizeI )
 			for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
@@ -4135,9 +4164,10 @@ private static class MatrixMultTask implements Callable<Object>
 		private final boolean _sparse; //sparse output
 		private final int _rl;
 		private final int _ru;
+		private final ConcurrentHashMap<double[], double[]> _cache;
 
 		protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
-			boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru )
+			boolean tm2, boolean pm2r, boolean pm2c, boolean m1Perm, boolean sparse, int rl, int ru, ConcurrentHashMap<double[], double[]> cache )
 		{
 			_m1 = m1;
 			_m2 = m2;
@@ -4148,7 +4178,8 @@ protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret,
 			_sparse = sparse;
 			_rl = rl;
 			_ru = ru;
-			
+			_cache = cache;
+
 			if( pm2r ) { //vector-matrix / matrix-matrix
 				//allocate local result for partial aggregation
 				_ret = new MatrixBlock(ret.rlen, ret.clen, false);
 			}
@@ -4174,7 +4205,11 @@ public Object call() {
 			if( _ret.sparse ) //ultra-sparse
 				matrixMultUltraSparse(_m1, _m2, _ret, _m1Perm, rl, ru);
 			else if(!_m1.sparse && !_m2.sparse)
-
matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu); + if(_m1.denseBlock instanceof DenseBlockFP64DEDUP && _m2.denseBlock.isContiguous(0,_m1.clen) && cl == 0 && cu == _m2.clen) + matrixMultDenseDenseMMDedup(_m1.denseBlock, _m2.denseBlock, _ret.denseBlock, _m2.clen, _m1.clen, rl, ru, _cache); + else + matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2r, rl, ru, cl, cu); + else if(_m1.sparse && _m2.sparse) matrixMultSparseSparse(_m1, _m2, _ret, _pm2r, _sparse, rl, ru); else if(_m1.sparse) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java index ebf4a6aeb3b..d0272548f45 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -42,6 +43,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.commons.math3.random.Well1024a; import org.apache.hadoop.io.DataInputBuffer; +import org.apache.sysds.common.Types; import org.apache.sysds.common.Types.BlockType; import org.apache.sysds.common.Types.CorrectionLocationType; import org.apache.sysds.conf.ConfigurationManager; @@ -56,6 +58,7 @@ import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.DenseBlockFP64; +import org.apache.sysds.runtime.data.DenseBlockFP64DEDUP; import org.apache.sysds.runtime.data.DenseBlockFactory; import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.data.SparseBlockCOO; @@ -172,7 +175,11 @@ public MatrixBlock(int rl, int cl, long estnnz) { public MatrixBlock(int rl, int cl, boolean sp, long estnnz) { reset(rl, cl, sp, estnnz, 0); } - + + public MatrixBlock(int rl, int cl, boolean sp, long estnnz, boolean dedup) { + reset(rl, cl, sp, estnnz, 0, dedup); + } + public MatrixBlock(MatrixBlock that) { copy(that); } @@ -298,7 +305,27 @@ public void reset(int rl, int cl, boolean sp, long estnnz, double val) { else resetDense(val); } - + + public void reset(int rl, int cl, boolean sp, long estnnz, double val, boolean dedup) { + //check for valid dimensions + if( rl < 0 || cl < 0 ) + throw new RuntimeException("Invalid block dimensions: "+rl+" "+cl); + + //reset basic meta data + rlen = rl; + clen = cl; + sparse = (val == 0) ? sp : false; + nonZeros = (val == 0) ? 0 : (long)rl*cl; + estimatedNNzsPerRow = (estnnz < 0 || !sparse) ? -1 : + (int)Math.ceil((double)estnnz/(double)rlen); + + //reset sparse/dense blocks + if( sparse ) + resetSparse(); + else + resetDense(val, dedup); + } + private void resetSparse() { if(sparseBlock == null) return; @@ -315,7 +342,18 @@ else if( val != 0 ) { denseBlock.set(val); } } - + + private void resetDense(double val, boolean dedup) { + //handle to dense block allocation and + //reset dense block to given value + if( denseBlock != null ) + denseBlock.reset(rlen, clen, val); + else if( val != 0 ) { + allocateDenseBlock(false, dedup); + denseBlock.set(val); + } + } + /** * NOTE: This method is designed only for dense representation. 
* @@ -401,6 +439,10 @@ public boolean allocateDenseBlock(boolean clearNNZ, boolean containsDuplicates) denseBlock = DenseBlockFactory.createDenseBlock(rlen, clen, containsDuplicates); return true; } + else if( containsDuplicates && !(denseBlock instanceof DenseBlockFP64DEDUP)) { + denseBlock = DenseBlockFactory.createDenseBlock(rlen, clen, true); + return true; + } else if( denseBlock.capacity() < limit ){ denseBlock.reset(rlen, clen); return true; @@ -2016,6 +2058,11 @@ public void readFields(DataInput in) cleanupBlock(false, true); //reuse dense readDenseBlock(in); //always dense in-mem if dense on disk break; + case DEDUP_BLOCK: + sparse = false; + cleanupBlock(false, true); //reuse dense + readDedupDenseBlock(in); //always dense in-mem if dense on disk + break; case EMPTY_BLOCK: sparse = true; cleanupBlock(true, !(sparseBlock instanceof SparseBlockCSR)); @@ -2031,6 +2078,33 @@ public void readFields(DataInput in) } } + private void readDedupDenseBlock(DataInput in) throws IOException, DMLRuntimeException { + allocateDenseBlock(true,true); + DenseBlock a = getDenseBlock(); + if(a.getDim(0) != rlen || a.getDim(1) != clen) + a.resetNoFill(rlen, clen); // reset the dimensions of a if incorrect. + HashMap mapping = new HashMap<>(); + for( int i=0; i a.numBlocks()) + throw new DMLRuntimeException("Serialize DedupDenseblock: block does not contain enough rows ["+a.numBlocks() +" < " + rlen + "]"); + + HashMap mapping = new HashMap<>((int) (a.getNrDistinctRows()*1.1)); + ArrayList unique_rows = new ArrayList<>((int) (a.getNrDistinctRows()*1.1)); + + for(int i=0; i seen = new HashMap<>(); + for (int i = 0; i < rows*10; i++) { + int row = Random.randInt(rows); + Integer tmpPos = seen.get(X[row]); + if(tmpPos == null) { + tmpPos = seen.size(); + seen.put(X[row], tmpPos); + } + X_duplicated[i] = X[row]; + mb.quickSetRow(i, X[row]); + } + + String fname = SCRIPT_DIR + TEST_DIR + "dedupSerializedBlock.out"; + LocalFileUtils.writeCacheBlockToLocal(fname, mb); + MatrixBlock mb2 = (MatrixBlock) LocalFileUtils.readCacheBlockFromLocal(fname, true); + + //compare matrices - values + for( int i=0; i seen2 = new HashMap<>(); + for( int i=0; i strings = generateRandomStrings(rows, 10); - - // Generate the dictionary by assigning unique ID to each distinct token - writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); - - // Create the dataset by repeating and shuffling the distinct tokens - List stringsColumn = shuffleAndMultiplyStrings(strings, 320); - writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); - - //run script - programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - } - catch(Exception ex) { - throw new RuntimeException(ex); - - } - finally { - resetExecMode(rtold); - } - } - - private void runTransformTest(String testname, ExecMode rt) - { - //set runtime platform - ExecMode rtold = setExecMode(rt); - try - { - int rows = 100; - int cols = 300; - getAndLoadTestConfiguration(testname); - fullDMLScriptName = getScript(); - - // Generate random embeddings for the distinct tokens - double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); - - // Generate random distinct tokens - List strings = generateRandomStrings(rows, 10); - - // Generate the dictionary by assigning unique ID to each distinct token - Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); - - // Create the dataset by repeating and 
shuffling the distinct tokens - List stringsColumn = shuffleAndMultiplyStrings(strings, 32); - writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); - - //run script - programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - - // Manually derive the expected result - double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); - - // Compare results - HashMap res_actual = readDMLMatrixFromOutputDir("result"); - double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); - TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); - } - catch(Exception ex) { - throw new RuntimeException(ex); - - } - finally { - resetExecMode(rtold); - } - } - - @SuppressWarnings("unused") - private void print2DimDoubleArray(double[][] resultActualDouble) { - Arrays.stream(resultActualDouble).forEach( - e -> System.out.println(Arrays.stream(e).mapToObj(d -> String.format("%06.1f", d)) - .reduce("", (sub, elem) -> sub + " " + elem))); - } - - private void runTransformTestMultiCols(String testname, ExecMode rt) - { - //set runtime platform - ExecMode rtold = setExecMode(rt); - try - { - int rows = 100; - int cols = 100; - getAndLoadTestConfiguration(testname); - fullDMLScriptName = getScript(); - - // Generate random embeddings for the distinct tokens - double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); - - // Generate random distinct tokens - List strings = generateRandomStrings(rows, 10); - - // Generate the dictionary by assigning unique ID to each distinct token - Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); - - // Create the dataset by repeating and shuffling the distinct tokens - List stringsColumn = shuffleAndMultiplyStrings(strings, 10); - writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); - - //run script - programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result"), output("result2")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); - - // Manually derive the expected result - double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); - - // Compare results - HashMap res_actual = readDMLMatrixFromOutputDir("result"); - HashMap res_actual2 = readDMLMatrixFromOutputDir("result2"); - double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); - double[][] resultActualDouble2 = TestUtils.convertHashMapToDoubleArray(res_actual2); - //System.out.println("Actual Result1 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); - ///print2DimDoubleArray(resultActualDouble); - //System.out.println("\nActual Result2 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); - //print2DimDoubleArray(resultActualDouble2); - //System.out.println("\nExpected Result [" + res_expected.length + "x" + res_expected[0].length + "]:"); - //print2DimDoubleArray(res_expected); - TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); - TestUtils.compareMatrices(resultActualDouble, resultActualDouble2, 1e-6); - } - catch(Exception ex) { - throw new RuntimeException(ex); - - } - finally { - resetExecMode(rtold); - } - } - - private double[][] manuallyDeriveWordEmbeddings(int cols, double[][] a, Map map, List stringsColumn) { - // Manually derive the expected result - double[][] res_expected = new 
double[stringsColumn.size()][cols]; - for (int i = 0; i < stringsColumn.size(); i++) { - int rowMapped = map.get(stringsColumn.get(i)); - System.arraycopy(a[rowMapped], 0, res_expected[i], 0, cols); - } - return res_expected; - } - - @SuppressWarnings("unused") - private double[][] generateWordEmbeddings(int rows, int cols) { - double[][] a = new double[rows][cols]; - for (int i = 0; i < a.length; i++) { - for (int j = 0; j < a[i].length; j++) { - a[i][j] = cols *i + j; - } - - } - return a; - } - - public static List shuffleAndMultiplyStrings(List strings, int multiply){ - List out = new ArrayList<>(); - Random random = new Random(); - for (int i = 0; i < strings.size()*multiply; i++) { - out.add(strings.get(random.nextInt(strings.size()))); - } - return out; - } - - public static List generateRandomStrings(int numStrings, int stringLength) { - List randomStrings = new ArrayList<>(); - Random random = new Random(); - String characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; - for (int i = 0; i < numStrings; i++) { - randomStrings.add(generateRandomString(random, stringLength, characters)); - } - return randomStrings; - } - - public static String generateRandomString(Random random, int stringLength, String characters){ - StringBuilder randomString = new StringBuilder(); - for (int j = 0; j < stringLength; j++) { - int randomIndex = random.nextInt(characters.length()); - randomString.append(characters.charAt(randomIndex)); - } - return randomString.toString(); - } - - public static void writeStringsToCsvFile(List strings, String fileName) { - try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { - for (String line : strings) { - bw.write(line); - bw.newLine(); - } - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static Map writeDictToCsvFile(List strings, String fileName) { - try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { - Map map = new HashMap<>(); - for (int i = 0; i < strings.size(); i++) { - map.put(strings.get(i), i); - bw.write(strings.get(i) + Lop.DATATYPE_PREFIX + (i+1) + "\n"); - } - return map; - } catch (IOException e) { - e.printStackTrace(); - return null; - } - } + private final static String TEST_NAME1 = "TransformFrameEncodeWordEmbeddings2"; + private final static String TEST_NAME2a = "TransformFrameEncodeWordEmbeddings2MultiCols1"; + private final static String TEST_NAME2b = "TransformFrameEncodeWordEmbeddings2MultiCols2"; + + private final static String TEST_DIR = "functions/transform/"; + private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeWordEmbedding1Test.class.getSimpleName() + "/"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_DIR, TEST_NAME1)); + addTestConfiguration(TEST_NAME2a, new TestConfiguration(TEST_DIR, TEST_NAME2a)); + addTestConfiguration(TEST_NAME2b, new TestConfiguration(TEST_DIR, TEST_NAME2b)); + } + + @Test + public void testTransformToWordEmbeddings() { + runTransformTest(TEST_NAME1, ExecMode.SINGLE_NODE); + } + + @Test + @Ignore + public void testNonRandomTransformToWordEmbeddings2Cols() { + runTransformTest(TEST_NAME2a, ExecMode.SINGLE_NODE); + } + + @Test + @Ignore + public void testRandomTransformToWordEmbeddings4Cols() { + runTransformTestMultiCols(TEST_NAME2b, ExecMode.SINGLE_NODE); + } + + @Test + @Ignore + public void runBenchmark(){ + runBenchmark(TEST_NAME1, ExecMode.SINGLE_NODE); + } + + + + + private void runBenchmark(String 
testname, ExecMode rt) + { + //set runtime platform + ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + private void runTransformTest(String testname, ExecMode rt) + { + //set runtime platform + ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + public static void print2DimDoubleArray(double[][] resultActualDouble) { + Arrays.stream(resultActualDouble).forEach( + e -> System.out.println(Arrays.stream(e).mapToObj(d -> String.format("%06.1f", d)) + .reduce("", (sub, elem) -> sub + " " + elem))); + } + + private void runTransformTestMultiCols(String testname, ExecMode rt) + { + //set runtime platform + ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 100; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + 
"dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 10); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result"), output("result2")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + HashMap res_actual2 = readDMLMatrixFromOutputDir("result2"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + double[][] resultActualDouble2 = TestUtils.convertHashMapToDoubleArray(res_actual2); + //System.out.println("Actual Result1 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); + print2DimDoubleArray(resultActualDouble); + //System.out.println("\nActual Result2 [" + resultActualDouble.length + "x" + resultActualDouble[0].length + "]:"); + //print2DimDoubleArray(resultActualDouble2); + //System.out.println("\nExpected Result [" + res_expected.length + "x" + res_expected[0].length + "]:"); + //print2DimDoubleArray(res_expected); + TestUtils.compareMatrices(resultActualDouble, res_expected, 1e-6); + TestUtils.compareMatrices(resultActualDouble, resultActualDouble2, 1e-6); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + public static double[][] manuallyDeriveWordEmbeddings(int cols, double[][] a, Map map, List stringsColumn) { + // Manually derive the expected result + double[][] res_expected = new double[stringsColumn.size()][cols]; + for (int i = 0; i < stringsColumn.size(); i++) { + int rowMapped = map.get(stringsColumn.get(i)); + System.arraycopy(a[rowMapped], 0, res_expected[i], 0, cols); + } + return res_expected; + } + + private double[][] generateWordEmbeddings(int rows, int cols) { + double[][] a = new double[rows][cols]; + for (int i = 0; i < a.length; i++) { + for (int j = 0; j < a[i].length; j++) { + a[i][j] = cols *i + j; + } + + } + return a; + } + + public static List shuffleAndMultiplyStrings(List strings, int multiply){ + List out = new ArrayList<>(); + Random random = new Random(); + for (int i = 0; i < strings.size()*multiply; i++) { + out.add(strings.get(random.nextInt(strings.size()))); + } + return out; + } + + public static List generateRandomStrings(int numStrings, int stringLength) { + List randomStrings = new ArrayList<>(); + Random random = new Random(); + String characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + for (int i = 0; i < numStrings; i++) { + randomStrings.add(generateRandomString(random, stringLength, characters)); + } + return randomStrings; + } + + public static String generateRandomString(Random random, int stringLength, String characters){ + StringBuilder randomString = new StringBuilder(); + for (int j = 0; j < stringLength; j++) { + int randomIndex = random.nextInt(characters.length()); + randomString.append(characters.charAt(randomIndex)); + } + return randomString.toString(); + } + + public static void writeStringsToCsvFile(List strings, String fileName) { + try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { + for (String line : strings) { + bw.write(line); + bw.newLine(); + } + } catch (IOException e) { + e.printStackTrace(); + } + 
} + + public static Map writeDictToCsvFile(List strings, String fileName) { + try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) { + Map map = new HashMap<>(); + for (int i = 0; i < strings.size(); i++) { + map.put(strings.get(i), i); + bw.write(strings.get(i) + Lop.DATATYPE_PREFIX + (i+1) + "\n"); + } + return map; + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } } diff --git a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingMMTest.java b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingMMTest.java new file mode 100644 index 00000000000..3862294ca6f --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingMMTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.test.functions.transform; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.sysds.test.functions.transform.TransformFrameEncodeWordEmbedding2Test.*; + +public class TransformFrameEncodeWordEmbeddingMMTest extends AutomatedTestBase { + private final static String TEST_NAME1 = "TransformFrameEncodeWordEmbeddingsMM"; + private final static String TEST_DIR = "functions/transform/"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_DIR, TEST_NAME1)); + } + + @Test + public void testMultiplication() { + runMatrixMultiplicationTest(TEST_NAME1, Types.ExecMode.SINGLE_NODE); + } + + private void runMatrixMultiplicationTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + double[][] b = createRandomMatrix("factor", cols, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + int factor = 320; + rows *= factor; + 
List stringsColumn = shuffleAndMultiplyStrings(strings, factor); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), input("factor"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] res_expectedMM = new double[rows][cols]; + for (int i = 0; i < res_expectedMM.length; i++) { + for (int j = 0; j < res_expectedMM[i].length; j++) { + res_expectedMM[i][j] = 0.0; + for (int k = 0; k < res_expected[i].length; k++) { + res_expectedMM[i][j] += res_expected[i][k]*b[k][j]; + } + } + } + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + TestUtils.compareMatrices(res_expectedMM, resultActualDouble, 1e-8); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingRowSumTest.java b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingRowSumTest.java new file mode 100644 index 00000000000..b3a09f3ec87 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeWordEmbeddingRowSumTest.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.transform; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.functionobjects.KahanPlus; +import org.apache.sysds.runtime.instructions.cp.KahanObject; +import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.sysds.runtime.functionobjects.KahanPlus.getKahanPlusFnObject; +import static org.apache.sysds.test.functions.transform.TransformFrameEncodeWordEmbedding2Test.*; + +public class TransformFrameEncodeWordEmbeddingRowSumTest extends AutomatedTestBase { + private final static String TEST_NAME1 = "TransformFrameEncodeWordEmbeddingsRowSum"; + private final static String TEST_NAME2 = "TransformFrameEncodeWordEmbeddingsColSum"; + private final static String TEST_NAME3 = "TransformFrameEncodeWordEmbeddingsFullSum"; + private final static String TEST_DIR = "functions/transform/"; + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_DIR, TEST_NAME1)); + addTestConfiguration(TEST_NAME2, new TestConfiguration(TEST_DIR, TEST_NAME2)); + addTestConfiguration(TEST_NAME3, new TestConfiguration(TEST_DIR, TEST_NAME3)); + } + + @Test + public void testDedupRowSums() { + runDedupRowSumTest(TEST_NAME1, Types.ExecMode.SINGLE_NODE); + } + + @Test + public void testDedupColSums() { + runDedupColSumTest(TEST_NAME2, Types.ExecMode.SINGLE_NODE); + } + + @Test + public void testDedupFullSums() { + runDedupFullSumTest(TEST_NAME3, Types.ExecMode.SINGLE_NODE); + } + + private void runDedupFullSumTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320*6); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] sums_expected = new double[1][1]; + KahanObject ko = new KahanObject(0,0); + KahanPlus kp = getKahanPlusFnObject(); + for (int i = 0; i < res_expected.length; i++) { + for (int j = 0; j < res_expected[i].length; j++) { + kp.execute2(ko, res_expected[i][j]); + } + } + sums_expected[0][0] = ko._sum; + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + 
TestUtils.compareMatrices(sums_expected, resultActualDouble, 1e-14); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + private void runDedupColSumTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320*6); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] sums_expected = new double[1][res_expected[0].length]; + KahanObject ko = new KahanObject(0,0); + KahanPlus kp = getKahanPlusFnObject(); + for (int i = 0; i < res_expected[0].length; i++) { + ko.set(0,0); + for (int j = 0; j < res_expected.length; j++) { + kp.execute2(ko, res_expected[j][i]); + } + sums_expected[0][i] = ko._sum; + } + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + TestUtils.compareMatrices(sums_expected, resultActualDouble, 1e-9); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } + + private void runDedupRowSumTest(String testname, Types.ExecMode rt) + { + //set runtime platform + Types.ExecMode rtold = setExecMode(rt); + try + { + int rows = 100; + int cols = 300; + getAndLoadTestConfiguration(testname); + fullDMLScriptName = getScript(); + + // Generate random embeddings for the distinct tokens + double[][] a = createRandomMatrix("embeddings", rows, cols, 0, 10, 1, new Date().getTime()); + + // Generate random distinct tokens + List strings = generateRandomStrings(rows, 10); + + // Generate the dictionary by assigning unique ID to each distinct token + Map map = writeDictToCsvFile(strings, baseDirectory + INPUT_DIR + "dict"); + + // Create the dataset by repeating and shuffling the distinct tokens + List stringsColumn = shuffleAndMultiplyStrings(strings, 320); + writeStringsToCsvFile(stringsColumn, baseDirectory + INPUT_DIR + "data"); + + //run script + programArgs = new String[]{"-stats","-args", input("embeddings"), input("data"), input("dict"), output("result")}; + runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + + // Manually derive the expected result + double[][] res_expected = manuallyDeriveWordEmbeddings(cols, a, map, stringsColumn); + double[][] sums_expected = new double[res_expected.length][1]; + KahanObject ko = new KahanObject(0,0); + KahanPlus kp = getKahanPlusFnObject(); + for (int i = 0; i < res_expected.length; i++) { + ko.set(0,0); + for (int j = 0; j < 
res_expected[i].length; j++) { + kp.execute2(ko, res_expected[i][j]); + } + sums_expected[i][0] = ko._sum; + } + // Compare results + HashMap res_actual = readDMLMatrixFromOutputDir("result"); + double[][] resultActualDouble = TestUtils.convertHashMapToDoubleArray(res_actual); + //print2DimDoubleArray(resultActualDouble); + TestUtils.compareMatrices(sums_expected, resultActualDouble, 1e-15); + } + catch(Exception ex) { + throw new RuntimeException(ex); + + } + finally { + resetExecMode(rtold); + } + } +} diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsColSum.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsColSum.dml new file mode 100644 index 00000000000..59e79631834 --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsColSum.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +MatrixRowSum = colSums(Data_enc); +write(MatrixRowSum, $4, format="text"); \ No newline at end of file diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsFullSum.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsFullSum.dml new file mode 100644 index 00000000000..5b5fb81303e --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsFullSum.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +MatrixRowSum = matrix(sum(Data_enc),1,1) +write(MatrixRowSum, $4, format="text"); \ No newline at end of file diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsMM.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsMM.dml new file mode 100644 index 00000000000..c439ef50d7b --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsMM.dml @@ -0,0 +1,38 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); +#Read the matrix that is used for multiplication after transform +MM = read($4, rows=300, cols=300, format="text"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +Product = Data_enc %*% MM +write(Product, $5, format="text"); + + + + diff --git a/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsRowSum.dml b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsRowSum.dml new file mode 100644 index 00000000000..2ce047ba39d --- /dev/null +++ b/src/test/scripts/functions/transform/TransformFrameEncodeWordEmbeddingsRowSum.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Read the pre-trained word embeddings +E = read($1, rows=100, cols=300, format="text"); +# Read the token sequence (1K) w/ 100 distinct tokens +Data = read($2, data_type="frame", format="csv"); +# Read the recode map for the distinct tokens +Meta = read($3, data_type="frame", format="csv"); + +jspec = "{ids: true, word_embedding: [1]}"; +Data_enc = transformapply(target=Data, spec=jspec, meta=Meta, embedding=E); +MatrixRowSum = rowSums(Data_enc); +write(MatrixRowSum, $4, format="text"); \ No newline at end of file
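
Note on the core technique in this patch: DenseBlockFP64DEDUP stores one double[] per row and shares a single array instance across duplicate rows, so per-row results (nnz counts in countNonZeros, Kahan row sums in d_uarkp, output rows in matrixMultDenseDenseMMDedup) can be memoized in a HashMap keyed by the array reference. Java arrays do not override hashCode()/equals(), so such a map compares keys by identity, which is exactly what shared row references need. The following is a minimal, self-contained sketch of that caching pattern outside SystemDS; the class and variable names are illustrative only and not part of the patch.

import java.util.HashMap;

public class DedupRowSumSketch {
	public static void main(String[] args) {
		// two distinct embedding rows, shared by reference across ten logical rows
		double[] e0 = {1.0, 2.0, 3.0};
		double[] e1 = {4.0, 5.0, 6.0};
		double[][] data = new double[10][];
		for (int i = 0; i < data.length; i++)
			data[i] = (i % 2 == 0) ? e0 : e1;

		// memoize the per-row sum once per distinct array reference,
		// mirroring the cache pattern in countNonZeros/d_uarkp above
		HashMap<double[], Double> cache = new HashMap<>();
		double total = 0;
		for (double[] row : data) {
			Double s = cache.get(row); // identity-based lookup
			if (s == null) {
				s = 0.0;
				for (double v : row)
					s += v;
				cache.put(row, s);
			}
			total += s;
		}
		System.out.println("distinct rows: " + cache.size()); // 2
		System.out.println("total sum: " + total); // 5*6 + 5*15 = 105.0
	}
}

With this layout, work scales with the number of distinct rows rather than the number of logical rows, which is why the tests above repeat 100 distinct tokens hundreds of times: the aggregate and matrix-multiply kernels then touch each unique embedding row only once.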