Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/serialization minimap #23

Merged
merged 38 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c413d85
Adding ability to parse string with minimap
mineralntl Jun 1, 2023
afb6e5e
Creating mini-map during serialization
mineralntl Jun 2, 2023
2a61d7e
Changing to TreeSet for ordering purposes
mineralntl Jun 2, 2023
0a07c22
WIP forming new mini-map string
mineralntl Jun 5, 2023
2e7fa76
Ensuring ordered types during serialization
mineralntl Jun 6, 2023
02dbeb5
Removing duplicate unit test
mineralntl Jun 7, 2023
7f7602c
Adding ability to parse string with minimap
mineralntl Jun 1, 2023
2260602
Creating mini-map during serialization
mineralntl Jun 2, 2023
4fc2d1e
Changing to TreeSet for ordering purposes
mineralntl Jun 2, 2023
117df42
WIP forming new mini-map string
mineralntl Jun 5, 2023
bab744a
Ensuring ordered types during serialization
mineralntl Jun 6, 2023
6e5d9d6
Removing duplicate unit test
mineralntl Jun 7, 2023
0ea1e9e
Moving hard coded strings
mineralntl Jun 23, 2023
fea8721
Merge branch 'feature/serializationMinimap' of github.com:NationalSec…
mineralntl Jun 23, 2023
ecc74cd
Formatting
mineralntl Jun 23, 2023
898f404
Merge branch 'main' into feature/serializationMinimap
mineralntl Jul 6, 2023
1048cab
Merge branch 'main' into feature/serializationMinimap
mineralntl Aug 31, 2023
1a4f8f3
Merge branch 'main' into feature/serializationMinimap
mineralntl Oct 18, 2023
359f647
Merge branch 'main' into feature/serializationMinimap
ivakegg Nov 21, 2023
d500bb9
Merge branch 'main' into feature/serializationMinimap
mineralntl Feb 16, 2024
afb24ab
Removing old method calls
mineralntl Feb 21, 2024
ec2511a
Removing unnecessary exception throwing
mineralntl Feb 21, 2024
4a4dc02
Formatting
mineralntl Feb 21, 2024
cb6551e
Updating to remove HashSet to preserve ordering
mineralntl Feb 23, 2024
a0664ab
Updating unit tests
mineralntl Feb 23, 2024
167b1c9
Updating unit tests again
mineralntl Feb 23, 2024
d69c97f
Updating unit tests again again
mineralntl Feb 23, 2024
a7f6893
Formatting
mineralntl Feb 23, 2024
747f013
Removing old methods
mineralntl Feb 23, 2024
b1b87b2
Adding in fieldName creation
mineralntl Feb 23, 2024
0fa5693
Updates based on testing
mineralntl Feb 26, 2024
f692d8a
Returning immutable map
mineralntl Feb 26, 2024
66d34f4
Fixing concatenated dataTypes
mineralntl Mar 6, 2024
e0fb91d
Merge branch 'main' into feature/serializationMinimap
mineralntl Mar 6, 2024
ac4623a
Merge branch 'main' into feature/serializationMinimap
mineralntl Apr 11, 2024
f1148a3
Merge branch 'main' into feature/serializationMinimap
mineralntl May 21, 2024
3646dad
Merge branch 'main' into feature/serializationMinimap
mineralntl Jun 24, 2024
5448b22
Merge branch 'main' into feature/serializationMinimap
mineralntl Jun 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@
<groupId>gov.nsa.datawave.microservice</groupId>
<artifactId>accumulo-utils</artifactId>
</dependency>
<dependency>
<groupId>gov.nsa.datawave.microservice</groupId>
<artifactId>common-utils</artifactId>
<version>2.0.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>gov.nsa.datawave.microservice</groupId>
<artifactId>type-utils</artifactId>
Expand Down
271 changes: 179 additions & 92 deletions src/main/java/datawave/query/util/TypeMetadata.java
Original file line number Diff line number Diff line change
@@ -1,50 +1,89 @@
package datawave.query.util;

import com.google.common.base.Splitter;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.google.common.base.Splitter;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;

public class TypeMetadata implements Serializable {

private Set<String> ingestTypes = Sets.newHashSet();
private Set<String> ingestTypes = new TreeSet<>();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're going to make these TreeSets then the variable should be migrated to a SortedSet


private Set<String> fieldNames = Sets.newHashSet();
private Set<String> fieldNames = new TreeSet<>();

/**
 * Returns the ingestType-name -> ordinal lookup table ("mini-map") that is
 * populated while serializing via {@code toString()}.
 *
 * NOTE(review): the internal map is returned directly, so callers can mutate
 * internal state — confirm whether an immutable view was intended.
 */
public Map<String,Integer> getIngestTypesMiniMap() {
return ingestTypesMiniMap;
}

/** Replaces the ingestType mini-map; used when deserializing via {@code fromString(String)}. */
public void setIngestTypesMiniMap(Map<String,Integer> ingestTypesMiniMap) {
this.ingestTypesMiniMap = ingestTypesMiniMap;
}

/**
 * Returns the dataType-name -> ordinal lookup table ("mini-map") that is
 * populated while serializing via {@code toString()}.
 *
 * NOTE(review): exposed mutably, same concern as {@code getIngestTypesMiniMap()}.
 */
public Map<String,Integer> getDataTypesMiniMap() {
return dataTypesMiniMap;
}

/** Replaces the dataType mini-map; used when deserializing via {@code fromString(String)}. */
public void setDataTypesMiniMap(Map<String,Integer> dataTypesMiniMap) {
this.dataTypesMiniMap = dataTypesMiniMap;
}

// Lookup tables backing the compact "mini-map" serialized form:
// type name -> the integer index used for it in the serialized string.
private Map<String,Integer> ingestTypesMiniMap;
private Map<String,Integer> dataTypesMiniMap;

// <ingestType, <fieldName, DataType(s)>>
protected Map<String,Multimap<String,String>> typeMetadata;

public static final Multimap<String,String> emptyMap = HashMultimap.create();

private static final String INGESTTYPE_PREFIX = "dts";
private static final String DATATYPES_PREFIX = "types";

/**
 * Creates an empty TypeMetadata with no field/type mappings.
 * TreeMaps are used for the mini-maps so their iteration order is sorted,
 * keeping the serialized form deterministic.
 */
public TypeMetadata() {
typeMetadata = Maps.newHashMap();
ingestTypesMiniMap = new TreeMap<>();
dataTypesMiniMap = new TreeMap<>();
}

/**
 * Creates a TypeMetadata by parsing a previously serialized string
 * (either the legacy format or the mini-map format) via {@code fromString}.
 *
 * @param in the serialized TypeMetadata string
 */
public TypeMetadata(String in) {
typeMetadata = Maps.newHashMap();
ingestTypesMiniMap = new TreeMap<>();
dataTypesMiniMap = new TreeMap<>();
this.fromString(in);
}

public TypeMetadata(TypeMetadata in) {
typeMetadata = Maps.newHashMap();
ingestTypesMiniMap = new TreeMap<>();
dataTypesMiniMap = new TreeMap<>();
// make sure we do a deep copy to avoid access issues later
for (Map.Entry<String,Multimap<String,String>> entry : in.typeMetadata.entrySet()) {
for (Entry<String,Multimap<String,String>> entry : in.typeMetadata.entrySet()) {
this.typeMetadata.put(entry.getKey(), HashMultimap.create(entry.getValue()));
}
this.ingestTypes.addAll(in.ingestTypes);
this.fieldNames.addAll(in.fieldNames);
this.ingestTypesMiniMap.putAll(in.getIngestTypesMiniMap());
this.dataTypesMiniMap.putAll(in.getDataTypesMiniMap());
}

/**
Expand Down Expand Up @@ -223,7 +262,7 @@ public TypeMetadata filter(Set<String> datatypeFilter) {
}

TypeMetadata typeMetadata = new TypeMetadata();
typeMetadata.ingestTypes = datatypeFilter;
typeMetadata.ingestTypes.addAll(datatypeFilter);
typeMetadata.typeMetadata.putAll(localMap);
return typeMetadata;
}
Expand All @@ -232,100 +271,150 @@ public boolean isEmpty() {
return this.keySet().isEmpty();
}

/**
 * Splits {@code in} on the delimiter {@code c}, treating any delimiter that
 * falls inside a square-bracket group as literal text.
 *
 * @param in the string to split
 * @param c the delimiter character
 * @return the segments found between unbracketed delimiters
 */
private static String[] parse(String in, char c) {
    List<String> segments = Lists.newArrayList();
    boolean bracketed = false;
    int segmentStart = 0;
    final int length = in.length();
    for (int idx = 0; idx < length; idx++) {
        final char current = in.charAt(idx);
        // track bracket nesting state first: '[' opens a group, ']' closes it
        if (current == '[')
            bracketed = true;
        if (current == ']')
            bracketed = false;
        // split only on delimiters that sit outside a bracket group
        if (current == c && !bracketed) {
            segments.add(in.substring(segmentStart, idx));
            segmentStart = idx + 1;
        }
    }
    // the remainder after the last delimiter (or the whole input if none matched)
    segments.add(in.substring(segmentStart));
    return Iterables.toArray(segments, String.class);
}

/**
 * Parses one mini-map entry of the serialized form into a name -> index map.
 *
 * Input examples:
 *   dts:[0:ingest1,1:ingest2]
 *   types:[0:DateType,1:IntegerType,2:LcType]
 *
 * @param typeEntry a "prefix:[index:name,...]" entry
 * @return a sorted map from type name to its serialized index
 */
private static Map<String,Integer> parseTypes(String typeEntry) {
    // drop the "dts:[" / "types:[" designation, then strip the trailing ']'
    String afterPrefix = typeEntry.split(":\\[")[1];
    String pairList = afterPrefix.substring(0, afterPrefix.length() - 1);

    Map<String,Integer> nameToIndex = new TreeMap<>();
    for (String pair : pairList.split(",")) {
        // each pair is "index:name"
        String[] parts = pair.split(":");
        nameToIndex.put(parts[1], Integer.valueOf(parts[0]));
    }
    return nameToIndex;
}

public String toString() {
StringBuilder sb = new StringBuilder();

Set<String> fieldNames = Sets.newHashSet();
for (String ingestType : typeMetadata.keySet()) {
// create and append ingestTypes mini-map
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm looking at the minimap for one of the tests and I'm wondering if the serialization schema can be simplified (and by extension, if the serialization/deserialization code can be simplified).

Take this for example:

# schema proposed in merge request
dts:[0:ingest1,1:ingest2];types:[0:DateType,1:IntegerType,2:LcType];FIELD1:[0:2,1:0];FIELD2:[0:1,1:2];FIELD3:[0:0,1:0]

# alternate schema
ingest1,ingest2;DateType,IntegerType,LcType;FIELD1:[0:2,1:0];FIELD2:[0:1,1:2];FIELD3:[0:0,1:0]

# 1. Split the serialized string by a semicolon; if at least three pieces exist then we're dealing with a minified string
# 2. Split the ingest type component by comma into an array. Now we have an array of ingest types.
# 3. Split the Types component by comma into an array. Now we have an array of Types
# 4. Split the field component by comma and iterate through. As you iterate the ingest type and Type indexes get plugged into the two arrays created earlier. 

Serialization should be simpler as well. It's calling Joiner.on(comma/colon/semi-colon).join(ingesttype/type/etc)

sb.append("dts:[");
Iterator<String> ingestIter = ingestTypes.iterator();
for (int i = 0; i < ingestTypes.size(); i++) {
String ingestType = ingestIter.next();
sb.append(i).append(":");
sb.append(ingestType);
sb.append(ingestIter.hasNext() ? "," : "];");
getIngestTypesMiniMap().put(ingestType, i);
}

// create and append dataTypes mini-map
sb.append("types:[");
Iterator<Multimap<String,String>> typesIter = typeMetadata.values().iterator();
Set<String> dataTypes = new TreeSet<>();
while (typesIter.hasNext()) {
dataTypes.addAll(typesIter.next().values());
}

Iterator<String> dataIter = dataTypes.iterator();
for (int i = 0; i < dataTypes.size(); i++) {
String dataType = dataIter.next();
sb.append(i).append(":");
sb.append(dataType);
sb.append(dataIter.hasNext() ? "," : "];");
getDataTypesMiniMap().put(dataType, i);
}

// append fieldNames and their associated ingestTypes and Normalizers
// ensure ordering for ease of type -> mini-map mapping
Set<String> fieldNames = new TreeSet<>();
Set<String> ingestTypes = typeMetadata.keySet().stream().sorted().collect(Collectors.toCollection(LinkedHashSet::new));
for (String ingestType : ingestTypes) {
fieldNames.addAll(typeMetadata.get(ingestType).keySet());
}

for (String fieldName : fieldNames) {
if (sb.length() > 0) {
sb.append(';');
}

sb.append(fieldName).append(':');
sb.append('[');
boolean firstField = true;
for (String ingestType : typeMetadata.keySet()) {
if (!typeMetadata.get(ingestType).containsKey(fieldName))
Iterator<String> fieldIter = fieldNames.iterator();
while (fieldIter.hasNext()) {
String fieldName = fieldIter.next();
sb.append(fieldName).append(":[");
Iterator<String> iIter = ingestTypes.iterator();
while (iIter.hasNext()) {
String ingestType = iIter.next();
if (!typeMetadata.get(ingestType).containsKey(fieldName)) {
continue;
if (!firstField)
sb.append(';');
firstField = false;
sb.append(ingestType);
sb.append(':');
boolean first = true;
for (String type : typeMetadata.get(ingestType).get(fieldName)) {
if (!first)
sb.append(',');
sb.append(type);
first = false;
}
for (String dataType : typeMetadata.get(ingestType).get(fieldName)) {
sb.append(getIngestTypesMiniMap().get(ingestType)).append(':');
sb.append(getDataTypesMiniMap().get(dataType));
}
sb.append(iIter.hasNext() ? "," : "");
}
sb.append(']');
sb.append(fieldIter.hasNext() ? "];" : "]");
}

return sb.toString();
}

private void fromString(String data) {
// was:
// field1:a,b;field2:d,e;field3:y,z

// post-fix: String should look like this:
// field1:[type1:a,b;type2:b];field2:[type1:a,b;type2:a,c]
fieldNames = Sets.newHashSet();
String[] entries = parse(data, ';');
for (String entry : entries) {
String[] entrySplits = parse(entry, ':');
if (2 != entrySplits.length) {
// Do nothing
} else {
// entrySplits[1] looks like this:
// [type1:a,b;type2:b] - split it on the ';'
// get rid of the leading and trailing brackets:
entrySplits[1] = entrySplits[1].substring(1, entrySplits[1].length() - 1);
String[] values = parse(entrySplits[1], ';');

for (String value : values) {

String[] vs = Iterables.toArray(Splitter.on(':').omitEmptyStrings().trimResults().split(value), String.class);

if (entries.length > 2) {
for (String entry : entries) {
if (entry.startsWith(INGESTTYPE_PREFIX)) {
setIngestTypesMiniMap(parseTypes(entry));
} else if (entry.startsWith(DATATYPES_PREFIX)) {
setDataTypesMiniMap(parseTypes(entry));
} else {
String[] entrySplits = parse(entry, ':');

Multimap<String,String> mm = typeMetadata.get(vs[0]);
if (null == mm) {
mm = HashMultimap.create();
typeMetadata.put(vs[0], mm);
}
// get rid of the leading and trailing brackets:
entrySplits[1] = entrySplits[1].substring(1, entrySplits[1].length() - 1);
String[] values = parse(entrySplits[1], ',');

String[] rhs = Iterables.toArray(Splitter.on(',').omitEmptyStrings().trimResults().split(vs[1]), String.class);
this.ingestTypes.add(vs[0]);
for (String r : rhs) {
mm.put(entrySplits[0], r);
for (String aValue : values) {
if (!aValue.isEmpty()) { // ignore last entry for trailing comma
// @formatter:off
String[] vs = Iterables
.toArray(Splitter.on(':')
.omitEmptyStrings()
.trimResults()
.split(aValue), String.class);

String ingestType = ImmutableMap.copyOf(getIngestTypesMiniMap())
.entrySet()
.stream()
.filter(e -> e.getValue().equals(Integer.valueOf(vs[0])))
.map(Entry::getKey)
.findFirst().get();

String dataType = ImmutableMap.copyOf(getDataTypesMiniMap())
.entrySet()
.stream()
.filter(e -> e.getValue().equals(Integer.valueOf(vs[1])))
.map(Entry::getKey)
.findFirst().get();
// @formatter:on

this.addTypeMetadata(entrySplits[0], ingestType, dataType);
}
}
fieldNames.add(entrySplits[0]);
}
fieldNames.add(entrySplits[0]);
}
}
}

/**
 * Splits {@code in} on occurrences of {@code c}, ignoring any delimiter
 * characters that appear inside a square-bracket group.
 *
 * NOTE(review): if the delimiter itself were ']' the split would still fire on
 * the closing bracket, because {@code inside} is cleared before the delimiter
 * check on the same character — callers only pass ';' and ',' here, so this is
 * not hit in practice.
 */
private static String[] parse(String in, char c) {
List<String> list = Lists.newArrayList();
boolean inside = false;
int start = 0;
for (int i = 0; i < in.length(); i++) {
// entering a bracketed group suppresses splitting until it closes
if (in.charAt(i) == '[')
inside = true;
if (in.charAt(i) == ']')
inside = false;
if (in.charAt(i) == c && !inside) {
list.add(in.substring(start, i));
start = i + 1;
}
}
// trailing segment after the final delimiter (or the whole string if none)
list.add(in.substring(start));
return Iterables.toArray(list, String.class);
}

@Override
Expand All @@ -346,20 +435,18 @@ public boolean equals(Object obj) {
return false;
TypeMetadata other = (TypeMetadata) obj;
if (typeMetadata == null) {
if (other.typeMetadata != null)
return false;
} else if (!toString().equals(obj.toString())) {
return false;
}
return true;
return other.typeMetadata == null;
} else
return toString().equals(obj.toString());
}

/**
 * Custom serialization hook: the entire object state is written as the
 * compact string produced by {@code toString()} rather than field-by-field.
 *
 * Fix: narrowed the nonstandard {@code throws Exception} to the conventional
 * serialization signature {@code throws IOException}; nothing in the body
 * throws any other checked exception.
 */
private void writeObject(ObjectOutputStream out) throws IOException {
    out.writeObject(this.toString());
}

/**
 * Custom deserialization hook: rebuilds all state by re-parsing the
 * serialized string written by {@code writeObject(ObjectOutputStream)}.
 *
 * Fix 1: readObject() bypasses the constructors, so the two mini-map fields
 * were left null here; the mini-map branch of fromString() dereferences them
 * and would throw a NullPointerException. Initialize them exactly as the
 * constructors do before parsing.
 * Fix 2: narrowed {@code throws Exception} to the conventional serialization
 * signature.
 */
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
    this.ingestTypes = Sets.newTreeSet();
    this.fieldNames = Sets.newTreeSet();
    this.typeMetadata = Maps.newHashMap();
    this.ingestTypesMiniMap = new TreeMap<>();
    this.dataTypesMiniMap = new TreeMap<>();
    this.fromString((String) in.readObject());
}
Expand Down
Loading
Loading