Fix transposed sparse encoding extraction
Baunsgaard committed Aug 18, 2023
1 parent 14f17ff commit 3d0f14c
Showing 8 changed files with 100 additions and 62 deletions.
==== Changed file 1 of 8 ====
@@ -284,6 +284,7 @@ else if(mb instanceof CompressedMatrixBlock && ((CompressedMatrixBlock) mb).isOv

 		res = new CompressedMatrixBlock(mb); // copy metadata and allocate soft reference

+		// LOG.error(mb);
 		classifyPhase();
 		if(compressionGroups == null)
 			return abortCompression();
==== Changed file 2 of 8 ====
@@ -115,37 +115,44 @@ private CompressedSizeInfoColGroup extractInfo(IEncode map, IColIndex colIndexes

 	private EstimationFactors scaleFactors(EstimationFactors sampleFacts, IColIndex colIndexes, int maxDistinct,
 		boolean dense) {
-		final int numRows = getNumRows();
-		final int nCol = colIndexes.size();
-
-		final double scalingFactor = (double) numRows / _sampleSize;
-
-		final long nnz = calculateNNZ(colIndexes, scalingFactor);
-		final int numOffs = calculateOffs(sampleFacts, numRows, scalingFactor, colIndexes, (int) nnz);
-		final int estDistinct = distinctCountScale(sampleFacts, numOffs, numRows, maxDistinct, dense, nCol);
-
-		// calculate the largest instance count.
-		final int maxLargestInstanceCount = numRows - estDistinct + 1;
-		final int scaledLargestInstanceCount = sampleFacts.largestOff < 0 ? numOffs /
-			estDistinct : (int) Math.floor(sampleFacts.largestOff * scalingFactor);
-		final int mostFrequentOffsetCount = Math.max(Math.min(maxLargestInstanceCount, scaledLargestInstanceCount),
-			numRows - numOffs);
-
-		final double overallSparsity = calculateSparsity(colIndexes, nnz, scalingFactor, sampleFacts.overAllSparsity);
-		// For robustness, add 30 percent more tuple sparsity
-		final double tupleSparsity = Math.min(overallSparsity * 1.3, 1.0); // increase sparsity by 30%.
-
-		if(_cs.isRLEAllowed()) {
-			final int scaledRuns = Math.max(estDistinct, calculateRuns(sampleFacts, scalingFactor, numOffs, estDistinct));
-			return new EstimationFactors(estDistinct, numOffs, mostFrequentOffsetCount, sampleFacts.frequencies,
-				sampleFacts.numSingle, numRows, scaledRuns, sampleFacts.lossy, sampleFacts.zeroIsMostFrequent,
-				overallSparsity, tupleSparsity);
-		}
-		else
-			return new EstimationFactors(estDistinct, numOffs, mostFrequentOffsetCount, sampleFacts.frequencies,
-				sampleFacts.numSingle, numRows, sampleFacts.lossy, sampleFacts.zeroIsMostFrequent, overallSparsity,
-				tupleSparsity);
+		try {
+			final int numRows = getNumRows();
+			final int nCol = colIndexes.size();
+
+			final double scalingFactor = (double) numRows / _sampleSize;
+
+			final long nnz = calculateNNZ(colIndexes, scalingFactor);
+			final int numOffs = calculateOffs(sampleFacts, numRows, scalingFactor, colIndexes, (int) nnz);
+			final int estDistinct = distinctCountScale(sampleFacts, numOffs, numRows, maxDistinct, dense, nCol);
+
+			// calculate the largest instance count.
+			final int maxLargestInstanceCount = numRows - estDistinct + 1;
+			final int scaledLargestInstanceCount = sampleFacts.largestOff < 0 ? numOffs /
+				estDistinct : (int) Math.floor(sampleFacts.largestOff * scalingFactor);
+			final int mostFrequentOffsetCount = Math.max(Math.min(maxLargestInstanceCount, scaledLargestInstanceCount),
+				numRows - numOffs);
+
+			final double overallSparsity = calculateSparsity(colIndexes, nnz, scalingFactor,
+				sampleFacts.overAllSparsity);
+			// For robustness, add 30 percent more tuple sparsity
+			final double tupleSparsity = Math.min(overallSparsity * 1.3, 1.0); // increase sparsity by 30%.
+
+			if(_cs.isRLEAllowed()) {
+				final int scaledRuns = Math.max(estDistinct,
+					calculateRuns(sampleFacts, scalingFactor, numOffs, estDistinct));
+				return new EstimationFactors(estDistinct, numOffs, mostFrequentOffsetCount, sampleFacts.frequencies,
+					sampleFacts.numSingle, numRows, scaledRuns, sampleFacts.lossy, sampleFacts.zeroIsMostFrequent,
+					overallSparsity, tupleSparsity);
+			}
+			else
+				return new EstimationFactors(estDistinct, numOffs, mostFrequentOffsetCount, sampleFacts.frequencies,
+					sampleFacts.numSingle, numRows, sampleFacts.lossy, sampleFacts.zeroIsMostFrequent, overallSparsity,
+					tupleSparsity);
+		}
+		catch(Exception e) {
+			throw new RuntimeException(colIndexes.toString(), e);
+		}
	}
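
Note: scaleFactors extrapolates statistics gathered on a sample to the full matrix via the ratio numRows / _sampleSize. A minimal, standalone sketch of that scaling arithmetic (illustrative names, not the SystemDS API):

// Standalone sketch of the sample-to-full-matrix scaling used above; all
// names are illustrative, only the arithmetic mirrors scaleFactors.
public final class ScaleSketch {

	// Extrapolate a non-zero count observed in the sample to the full matrix.
	static long scaleNnz(long sampleNnz, int numRows, int sampleSize) {
		final double scalingFactor = (double) numRows / sampleSize;
		return (long) (sampleNnz * scalingFactor);
	}

	public static void main(String[] args) {
		// 1234 non-zeros in a 10,000-row sample of a 1,000,000-row column
		// extrapolate to roughly 123,400 non-zeros overall.
		System.out.println(scaleNnz(1_234, 1_000_000, 10_000)); // 123400
	}
}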

@@ -157,7 +164,8 @@ private int distinctCountScale(EstimationFactors sampleFacts, int numOffs, int n

 		// the sampled size is smaller than the actual size if there were empty rows,
 		// and the more we can reduce this value, the more accurate the estimation becomes.
 		final int sampledSize = sampleFacts.numOffs;
-		int est = SampleEstimatorFactory.distinctCount(freq, dense ? numRows : numOffs, sampledSize, _cs.estimationType);
+		int est = SampleEstimatorFactory.distinctCount(freq, dense ? numRows : numOffs, sampledSize,
+			_cs.estimationType);
 		if(est > 10000)
 			est += est * 0.5;
 		if(nCol > 4) // Increase the estimate when co-coding many columns, to be safe
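
Note: the guards after the estimator call are plain safety inflation. A worked example of the first one:

// Post-estimate safety inflation, as worked arithmetic: a raw estimate over
// 10000 distinct values is inflated by 50% to hedge against underestimation.
public final class EstimateInflationDemo {
	public static void main(String[] args) {
		int est = 12_000;
		if(est > 10_000)
			est += est * 0.5; // compound assignment narrows back to int
		System.out.println(est); // 18000
	}
}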
==== Changed file 3 of 8 ====
@@ -19,6 +19,7 @@

 package org.apache.sysds.runtime.compress.estim;

+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.DMLCompressionException;

 /**
@@ -94,10 +95,18 @@ else if(largestOff > numRows)
 				"Invalid number of instances of most common element; should be lower than number of rows. " + largestOff
 					+ " > numRows: " + numRows);
 		else if(numVals > numOffs)
-			throw new DMLCompressionException("Num vals cannot be greater than num offs: vals: " + numVals + " offs: " + numOffs);
+			throw new DMLCompressionException(
+				"Num vals cannot be greater than num offs: vals: " + numVals + " offs: " + numOffs);
+
+		if(CompressedMatrixBlock.debug && frequencies != null) {
+			for(int i = 0; i < frequencies.length; i++) {
+				if(frequencies[i] == 0)
+					throw new DMLCompressionException("Invalid counts in fact contains 0");
+			}
+		}
 	}

-	public int[] getFrequencies(){
+	public int[] getFrequencies() {
 		return frequencies;
 	}
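
Note: the zero-count guard added here is repeated verbatim in the DenseEncoding and SparseEncoding constructors below. A hypothetical shared helper (not part of this commit) stating the invariant — every recorded frequency must be strictly positive:

// Hypothetical helper, not in the commit; assumes the DMLCompressionException
// already imported above. The invariant all three debug blocks check is that
// no distinct value is recorded with a zero count.
static void validateCounts(int[] frequencies) {
	if(frequencies == null)
		return; // EstimationFactors tolerates missing frequencies
	for(int f : frequencies)
		if(f == 0)
			throw new DMLCompressionException("Invalid counts in fact contains 0");
}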

==== Changed file 4 of 8 ====
@@ -24,6 +24,7 @@

 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
@@ -40,7 +41,14 @@ public class DenseEncoding extends AEncode {

 	public DenseEncoding(AMapToData map) {
 		this.map = map;
-		map.getCounts();
+
+		if(CompressedMatrixBlock.debug) {
+			int[] freq = map.getCounts();
+			for(int i = 0; i < freq.length; i++) {
+				if(freq[i] == 0)
+					throw new DMLCompressionException("Invalid counts in fact contains 0");
+			}
+		}
 	}

@Override
==== Changed file 5 of 8 ====
@@ -237,7 +237,7 @@ else if(alen - apos > nCol / 4) { // return a dense encoding

 		// Iteration 2 over the non-zero values: make either a dense or a sparse IEncode map.
 		for(int i = apos, j = 0; i < alen; i++, j++)
 			if(!Double.isNaN(avals[i]))
-				d.set(j, map.get(avals[i]));
+				d.set(j, map.getId(avals[i]));

 		// Iteration 3 over the non-zero indexes: make an Offset Encoding recording which cells are zero and which are not.
 		// not done yet
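
Note: this one-line change is the extraction fix named in the commit title. Assuming the semantics suggested by the surrounding code — getId returns a value's dictionary index while get returns something else, presumably its count — a toy stand-in shows why confusing the two corrupts the encoding:

import java.util.LinkedHashMap;
import java.util.Map;

// Toy stand-in for the value-count map, with assumed semantics: each distinct
// value gets an insertion-order id and an occurrence count. Writing get(...)
// (the count) where getId(...) (the dictionary index) is expected fills the
// encoding with frequencies instead of dictionary references.
final class ToyCountMap {
	private final Map<Double, int[]> entries = new LinkedHashMap<>(); // value -> {id, count}

	void increment(double v) {
		entries.computeIfAbsent(v, k -> new int[] {entries.size(), 0})[1]++;
	}

	int getId(double v) { return entries.get(v)[0]; } // dictionary index
	int get(double v) { return entries.get(v)[1]; } // occurrence count
}

With inserts 5.0, 5.0, 7.0, getId(5.0) is 0 but get(5.0) is 2 — an id that does not even exist in a two-entry dictionary.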
==== Changed file 6 of 8 ====
@@ -24,6 +24,7 @@

 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
@@ -53,6 +54,14 @@ protected SparseEncoding(AMapToData map, AOffset off, int nRows) {
 		this.map = map;
 		this.off = off;
 		this.nRows = nRows;
+
+		if(CompressedMatrixBlock.debug) {
+			int[] freq = map.getCounts();
+			for(int i = 0; i < freq.length; i++) {
+				if(freq[i] == 0)
+					throw new DMLCompressionException("Invalid counts in fact contains 0");
+			}
+		}
 	}

@Override
@@ -122,9 +131,7 @@ protected IEncode combineSparse(SparseEncoding e) {
 		}
 	}

-
-
 	private Pair<IEncode, Map<Integer, Integer>> combineSparseNoResizeDense(SparseEncoding e) {

 		final int fl = off.getOffsetToLast();
 		final int fr = e.off.getOffsetToLast();
@@ -134,7 +141,7 @@ private Pair<IEncode, Map<Integer, Integer>> combineSparseNoResizeDense(SparseE
 		final int nVl = getUnique();
 		final int nVr = e.getUnique();
 		final AMapToData retMap = MapToFactory.create(nRows, (nVl + 1) * (nVr + 1));

 		int il = itl.value();
 		// parse through one side, setting all its values into the dense map.
 		while(il < fl) {
@@ -155,16 +162,15 @@ private Pair<IEncode, Map<Integer, Integer>> combineSparseNoResizeDense(SparseE

 		// Full iteration to set unique elements.
 		final Map<Integer, Integer> m = new HashMap<>();
-		for(int i = 0 ; i < retMap.size(); i ++)
-			addValHashMap(retMap.getIndex(i), i,m, retMap );
+		for(int i = 0; i < retMap.size(); i++)
+			addValHashMap(retMap.getIndex(i), i, m, retMap);

 		return new ImmutablePair<>(new DenseEncoding(retMap.resize(m.size())), m);
-
 	}

-	protected static void addValHashMap(final int nv, final int r, final Map<Integer, Integer> map, final AMapToData d) {
+	protected static void addValHashMap(final int nv, final int r, final Map<Integer, Integer> map,
+		final AMapToData d) {
 		final int v = map.size();
 		final Integer mv = map.putIfAbsent(nv, v);
 		if(mv == null)
@@ -173,7 +179,6 @@ protected static void addValHashMap(final int nv, final int r, final Map<Integer
 			d.set(r, mv);
 	}

-
 	private static int combineSparse(AMapToData lMap, AMapToData rMap, AIterator itl, AIterator itr,
 		final IntArrayList retOff, final IntArrayList tmpVals, final int fl, final int fr, final int nVl, final int nVr,
 		final int[] d) {
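
Note: combineSparseNoResizeDense allocates (nVl + 1) * (nVr + 1) map slots — one per pair of left/right ids, with the extra slot per side standing for rows where that side has no offset — and then compacts the keys that actually occur. A sketch of that combine-and-remap pattern, mirroring addValHashMap's putIfAbsent idiom (the key formula is an assumption, not taken from the diff):

import java.util.HashMap;
import java.util.Map;

// Sketch: combine two per-row id vectors into one, then remap the sparse
// combined key space of size (nVl + 1) * (nVr + 1) to consecutive ids,
// assigning ids in first-seen order as addValHashMap does.
public final class CombineSketch {
	static int[] combine(int[] left, int[] right, int nVr) {
		final Map<Integer, Integer> m = new HashMap<>();
		final int[] out = new int[left.length];
		for(int i = 0; i < left.length; i++) {
			final int key = left[i] * (nVr + 1) + right[i]; // assumed key formula
			final int v = m.size(); // candidate id: first-seen order
			final Integer prev = m.putIfAbsent(key, v);
			out[i] = (prev == null) ? v : prev;
		}
		return out;
	}

	public static void main(String[] args) {
		// left ids {0,1,0}, right ids {1,0,1}, nVr = 1 -> keys {1,2,1} -> ids {0,1,0}
		int[] res = combine(new int[] {0, 1, 0}, new int[] {1, 0, 1}, 1);
		System.out.println(java.util.Arrays.toString(res)); // [0, 1, 0]
	}
}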
@@ -382,7 +387,7 @@ public EstimationFactors extractFacts(int nRows, double tupleSparsity, double ma
 		CompressionSettings cs) {
 		final int largestOffs = nRows - map.size(); // the known largest offset count is the zero tuples
 		tupleSparsity = Math.min((double) map.size() / (double) nRows, tupleSparsity);
-		final int[] counts = map.getCounts(new int[map.getUnique()]);
+		final int[] counts = map.getCounts();

 		if(cs.isRLEAllowed())
 			return new EstimationFactors(map.getUnique(), map.size(), largestOffs, counts, 0, nRows, map.countRuns(off),
==== Changed file 7 of 8 ====
@@ -19,6 +19,7 @@

 package org.apache.sysds.runtime.compress.estim.sample;

+import java.util.Arrays;
 import java.util.HashMap;

 import org.apache.commons.logging.Log;
@@ -108,20 +109,26 @@ private static int distinctCountWithHistogram(int numVals, int[] invHist, int[]
 	}

 	private static int[] getInvertedFrequencyHistogram(int[] frequencies) {
-		final int numVals = frequencies.length;
-		// Find max
-		int maxCount = 0;
-		for(int i = 0; i < numVals; i++) {
-			final int v = frequencies[i];
-			if(v > maxCount)
-				maxCount = v;
-		}
-
-		// create frequency histogram
-		int[] freqCounts = new int[maxCount];
-		for(int i = 0; i < numVals; i++)
-			freqCounts[frequencies[i] - 1]++;
-
-		return freqCounts;
+		try{
+			final int numVals = frequencies.length;
+			// Find max
+			int maxCount = 0;
+			for(int i = 0; i < numVals; i++) {
+				final int v = frequencies[i];
+				if(v > maxCount)
+					maxCount = v;
+			}
+
+			// create frequency histogram
+			int[] freqCounts = new int[maxCount];
+			for(int i = 0; i < numVals; i++)
+				freqCounts[frequencies[i] - 1]++;
+
+			return freqCounts;
+		}
+		catch(Exception e){
+			throw new RuntimeException(Arrays.toString(frequencies), e);
+		}
 	}
 }
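
Note: a worked example of the inverted histogram, including the failure mode that presumably motivated both the try/catch here (which now attaches the offending frequencies to the rethrown exception) and the zero-count debug checks above — a zero entry in frequencies indexes freqCounts[-1] and throws ArrayIndexOutOfBoundsException:

import java.util.Arrays;

// frequencies counts occurrences per distinct value; freqCounts[k - 1] counts
// how many distinct values occur exactly k times.
public final class InvHistDemo {
	public static void main(String[] args) {
		final int[] frequencies = {3, 1, 3, 2};
		final int maxCount = 3; // max of frequencies
		final int[] freqCounts = new int[maxCount];
		for(int f : frequencies)
			freqCounts[f - 1]++; // f == 0 would index -1 and throw here
		System.out.println(Arrays.toString(freqCounts)); // [1, 1, 2]
	}
}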
==== Changed file 8 of 8 ====
@@ -40,7 +40,7 @@ public ACountHashMap(int arrSize) {
 		if(arrSize < shortCutSize)
 			data = create(1);
 		else {
-			arrSize = (int)(arrSize * (1.0 / LOAD_FACTOR));
+			arrSize = (int) (arrSize * (1.0 / LOAD_FACTOR));
 			arrSize += arrSize % 2;
 			data = create(arrSize);
 		}
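
Note: the sizing arithmetic above oversizes the backing array by the inverse load factor and rounds an odd size up to even. With a hypothetical LOAD_FACTOR of 0.6 (the real constant is defined elsewhere in ACountHashMap):

// Hypothetical LOAD_FACTOR; only the arithmetic mirrors the hunk above.
public final class SizingDemo {
	static final double LOAD_FACTOR = 0.6;

	public static void main(String[] args) {
		int arrSize = 50;
		arrSize = (int) (arrSize * (1.0 / LOAD_FACTOR)); // 83
		arrSize += arrSize % 2; // 84, rounded up to even
		System.out.println(arrSize);
	}
}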
