Skip to content

Commit

Permalink
Improved implementation of PrefixMapStd
Browse files Browse the repository at this point in the history
  • Loading branch information
Aklakan committed Aug 24, 2022
1 parent 772f84c commit b78216e
Show file tree
Hide file tree
Showing 2 changed files with 390 additions and 94 deletions.
306 changes: 212 additions & 94 deletions jena-arq/src/main/java/org/apache/jena/riot/system/PrefixMapStd.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,21 @@

package org.apache.jena.riot.system;

import static org.apache.jena.riot.system.PrefixLib.canonicalPrefix;
import static org.apache.jena.riot.system.PrefixLib.isSafeLocalPart;

import java.util.Collections ;
import java.util.Map ;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.Optional;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

import org.apache.jena.atlas.lib.Pair ;
import org.apache.jena.atlas.lib.Pair;
import org.apache.jena.atlas.lib.Trie;
import org.apache.jena.ext.com.google.common.base.Stopwatch;
import org.apache.jena.ext.com.google.common.cache.Cache;
import org.apache.jena.ext.com.google.common.cache.CacheBuilder;
import org.apache.jena.sparql.graph.PrefixMappingBase;

/**
* In-memory implementation of a {@link PrefixMap}.
Expand All @@ -45,134 +51,246 @@
* copy is cheaper than repeated reverse lookups would be.
*/
public class PrefixMapStd extends PrefixMapBase {
// Expansion map
final Map<String, String> prefixes = new ConcurrentHashMap<>();

// Immutable view of prefixes
private final Map<String, String> prefixes2 = Collections.unmodifiableMap(prefixes);
public static final int DFT_CACHE_SIZE = 1000;

// Abbreviation map used for common cases.
// This keeps the URI->prefix mappings for a computed guess at the answer, before
// resorting to a full search. See abbrev(String) below.
private final Map<String, String> uriToPrefix = new ConcurrentHashMap<>();
private Map<String, String> prefixToIri = new LinkedHashMap<>();
private Map<String, String> prefixToIriView = Collections.unmodifiableMap(prefixToIri);

/**
* Creates a new empty prefix mapping
*/
public PrefixMapStd() {}
/** A trie for longest prefix lookups */
private Trie<String> iriToPrefixTrie = new Trie<>();

/**
* Creates a new prefix mapping copied from an existing map
* @param prefixMap Prefix Map
*/
/** For exact matches of IRI strings the map is much faster than the trie */
private Map<String, String> iriToPrefixMap = new HashMap<>();

/** A cache for mapping iris to prefixes.
* Wrapping with Optional is needed because the Guava Cache does not allow for null values */
private Cache<String, Optional<String>> cache;

/** A generation counter that is incremented on modifications and which is
* used to invalidate the internal cache when needed.
* If generation and cacheVersion differ then the next prefix lookup will invalidate the cache and
* set cacheVersion to generation. */
private int generation = 0;
private int cacheVersion = 0;

/**
* Creates a new empty prefix map by delegating to the cache-size constructor
* with {@link #DFT_CACHE_SIZE}.
*/
public PrefixMapStd() {
this(DFT_CACHE_SIZE);
}

/**
 * Creates a prefix map containing a copy of the declarations of another prefix map.
 * Only the prefixes are copied; the lookup cache of the new map starts out empty.
 *
 * @param prefixMap the prefix map to copy from; must not be {@code null}
 * @throws NullPointerException if {@code prefixMap} is {@code null}
 */
public PrefixMapStd(PrefixMap prefixMap) {
    // The delegating constructor call must be the first statement,
    // so the null check can only happen afterwards.
    this(DFT_CACHE_SIZE);
    Objects.requireNonNull(prefixMap);
    // Copy through putAll rather than writing into the forward map directly.
    // NOTE(review): presumably putAll routes through add(...), which keeps the
    // reverse indexes in step with the forward map - confirm in PrefixMapBase.
    putAll(prefixMap);
}

@Override
public Map<String, String> getMapping() {
return prefixes2;
/**
 * Creates a new empty prefix map with a size-bounded cache for IRI-to-prefix lookups.
 *
 * @param longestMatchCacheSize the maximum size handed to the cache builder
 */
public PrefixMapStd(long longestMatchCacheSize) {
    CacheBuilder<Object, Object> sizedBuilder =
            CacheBuilder.newBuilder().maximumSize(longestMatchCacheSize);
    this.cache = sizedBuilder.build();
}

@Override
public String get(String prefix) {
Objects.requireNonNull(prefix);
prefix = canonicalPrefix(prefix);
return prefixes.get(prefix);
/**
 * Takes a guess for the namespace IRI string to use in abbreviation.
 * The guess is the part of the IRI string up to and including the last
 * '#' or '/', whichever of the two occurs later in the string.
 *
 * @param iriString the IRI string to split
 * @return the guessed namespace, or {@code null} if the string contains neither '#' nor '/'
 */
protected static String getPossibleKey(String iriString) {
    // We could add ':' here, it is used as a separator in URNs.
    // But it is a multiple use character and always present in the scheme name.
    // This is a fast-track guess so don't try guessing based on ':'.
    int lastHash = iriString.lastIndexOf('#');
    int lastSlash = iriString.lastIndexOf('/');
    int splitAt = Math.max(lastHash, lastSlash);
    if (splitAt < 0) {
        return null;
    }
    return iriString.substring(0, splitAt + 1);
}

/**
 * Finds a prefix for the given IRI string.
 * First the namespace guess from {@link #getPossibleKey(String)} is tried as an exact
 * key in the IRI-to-prefix map; a hit there avoids writing to the cache but is not
 * necessarily the longest matching namespace. Otherwise the cached longest-match
 * lookup is used.
 *
 * @param iriStr the IRI string to find a prefix for
 * @return the prefix, or {@code null} if none was found
 */
protected String performPrefixLookup(String iriStr) {
    // Fast track: exact match on the guessed namespace.
    String namespaceGuess = getPossibleKey(iriStr);
    if (namespaceGuess != null) {
        String exactHit = iriToPrefixMap.get(namespaceGuess);
        if (exactHit != null) {
            return exactHit;
        }
    }
    // No exact hit: fall back to the longest-match search.
    return cachedPrefixLookup(iriStr).orElse(null);
}

/**
 * Looks up the prefix for an IRI via the cache, computing and storing the answer
 * with {@link #uncachedPrefixLookup(String)} on a miss.
 * If the map was modified since the cache was last synchronised
 * ({@code cacheVersion != generation}) the cache is emptied first.
 *
 * @param iri the IRI string to look up
 * @return the prefix wrapped in an {@link Optional}; empty when no prefix matches
 * @throws RuntimeException if loading the cache entry fails
 */
protected Optional<String> cachedPrefixLookup(String iri) {
    boolean cacheIsStale = cacheVersion != generation;
    if (cacheIsStale) {
        cache.invalidateAll();
        cacheVersion = generation;
    }

    try {
        // Optional wrapping lets "no prefix" be cached as a value too.
        return cache.get(iri, () -> Optional.ofNullable(uncachedPrefixLookup(iri)));
    } catch (ExecutionException ex) {
        throw new RuntimeException("Unexpected failure during cache lookup", ex);
    }
}

/**
 * Asks the trie for its longest match for the given IRI string, bypassing the cache.
 *
 * @param iriStr the IRI string to look up
 * @return the result of the trie's longest-match query
 */
protected String uncachedPrefixLookup(String iriStr) {
    return iriToPrefixTrie.longestMatch(iriStr);
}

/**
 * Declares a prefix, replacing any IRI the prefix was previously bound to.
 * The forward map, the trie and the exact-match map are all updated, and the
 * generation counter is incremented so that the lookup cache is invalidated
 * on the next cached lookup.
 *
 * @param prefix the prefix; canonicalised before use, must not be {@code null}
 * @param iri    the namespace IRI string, must not be {@code null}
 * @throws NullPointerException if either argument is {@code null}
 */
@Override
public void add(String prefix, String iri) {
    Objects.requireNonNull(prefix);
    Objects.requireNonNull(iri);
    String canonicalPrefix = PrefixLib.canonicalPrefix(prefix);
    String oldIri = prefixToIri.get(canonicalPrefix);
    // Drop the reverse entries of the old IRI only when they point back at this
    // prefix. Another prefix bound to the same IRI may own them, and removing
    // them would lose that prefix's abbreviation. delete(...) uses the same guard.
    if (oldIri != null && canonicalPrefix.equals(iriToPrefixMap.get(oldIri))) {
        iriToPrefixTrie.remove(oldIri);
        iriToPrefixMap.remove(oldIri);
    }
    prefixToIri.put(canonicalPrefix, iri);
    iriToPrefixTrie.add(iri, canonicalPrefix);
    iriToPrefixMap.put(iri, canonicalPrefix);
    ++generation;
}

/**
 * Removes a prefix declaration.
 * See notes on reverse mappings in {@link PrefixMappingBase}.
 * <p>
 * The reverse entries (trie and exact-match map) for the prefix's IRI are removed
 * only if they currently point back at the deleted prefix. The generation counter
 * is incremented so that the lookup cache is invalidated on the next cached lookup.
 * <p>
 * Test {@code AbstractTestPrefixMapping.testSecondPrefixDeletedUncoversPreviousMap}.
 *
 * @param prefix the prefix to remove; canonicalised before use, must not be {@code null}
 * @throws NullPointerException if {@code prefix} is {@code null}
 */
@Override
public void delete(String prefix) {
    Objects.requireNonNull(prefix);
    String canonicalPrefix = PrefixLib.canonicalPrefix(prefix);
    String iriForPrefix = prefixToIri.remove(canonicalPrefix);
    if (iriForPrefix != null) {
        String prefixForIri = iriToPrefixMap.get(iriForPrefix);
        if (canonicalPrefix.equals(prefixForIri)) {
            iriToPrefixTrie.remove(iriForPrefix);
            // The exact-match map is keyed by IRI, so remove by IRI, not by prefix.
            iriToPrefixMap.remove(iriForPrefix);
        }
    }
    ++generation;
}

@Override
public void clear() {
prefixes.clear() ;
/**
 * Splits an IRI string into a prefix and a local name.
 * The prefix comes from {@link #performPrefixLookup(String)}; the local name is
 * whatever follows the namespace IRI bound to that prefix. The split is only
 * returned if {@code PrefixLib.isSafeLocalPart} accepts the local name.
 *
 * @param iriStr the IRI string to abbreviate, must not be {@code null}
 * @return a (prefix, local name) pair, or {@code null} if no usable abbreviation exists
 * @throws NullPointerException if {@code iriStr} is {@code null}
 */
public Pair<String, String> abbrev(String iriStr) {
    Objects.requireNonNull(iriStr);

    String matchedPrefix = performPrefixLookup(iriStr);
    if (matchedPrefix == null) {
        return null;
    }

    String namespace = prefixToIri.get(matchedPrefix);
    if (namespace == null) {
        return null;
    }

    // Post process a found solution
    String localPart = iriStr.substring(namespace.length());
    if (!PrefixLib.isSafeLocalPart(localPart)) {
        return null;
    }
    return Pair.create(matchedPrefix, localPart);
}

@Override
public boolean containsPrefix(String prefix) {
Objects.requireNonNull(prefix);
prefix = canonicalPrefix(prefix);
return prefixes.containsKey(prefix);
/**
 * Abbreviates an IRI string to the form {@code prefix:localName}.
 *
 * @param iriStr the IRI string to abbreviate, must not be {@code null}
 * @return the abbreviated form, or {@code null} if {@link #abbrev(String)} finds no
 *         split or the local name does not pass the {@code strSafeFor} check for ':'
 * @throws NullPointerException if {@code iriStr} is {@code null}
 */
public String abbreviate(String iriStr) {
    Objects.requireNonNull(iriStr);

    Pair<String, String> split = abbrev(iriStr);
    if (split == null) {
        return null;
    }

    String localName = split.getRight();
    // Safe for RDF/XML as well
    if (!strSafeFor(localName, ':')) {
        return null;
    }
    return split.getLeft() + ":" + localName;
}

@Override
public String abbreviate(String uriStr) {
Objects.requireNonNull(uriStr);
Pair<String, String> p = abbrev(uriStr);
if (p == null)
return null;
return p.getLeft() + ":" + p.getRight();
/**
 * Returns the namespace IRI string bound to a prefix.
 *
 * @param prefix the prefix; canonicalised before the lookup, must not be {@code null}
 * @return the IRI string stored for the prefix, or {@code null} if there is none
 * @throws NullPointerException if {@code prefix} is {@code null}
 */
public String get(String prefix) {
    String key = PrefixLib.canonicalPrefix(Objects.requireNonNull(prefix));
    return prefixToIri.get(key);
}

// This is thread safe (does not crash) - it is not thread-consistent (answer
// uncertain if the prefix mappings are in flux).
@Override
public Pair<String, String> abbrev(String uriStr) {
Objects.requireNonNull(uriStr);
// Fast path.
// Look for a prefix by URI ending "#" or "/"
// then look for that as a known prefix.
String candidate = getPossibleKey(uriStr);
if ( candidate != null ) {
String uriForPrefix = uriToPrefix.get(candidate);
if ( uriForPrefix != null ) {
// Fast track.
String ln = uriStr.substring(candidate.length());
if ( isSafeLocalPart(ln) )
return Pair.create(uriForPrefix, ln);
}
}
// Not in the uri -> prefix map. Crunch it.
return PrefixLib.abbrev(prefixes, uriStr, true);
/**
* Returns the prefix-to-IRI mapping as the unmodifiable view held in {@code prefixToIriView}.
*
* @return the unmodifiable view of the prefix-to-IRI map
*/
public Map<String, String> getMapping() {
return prefixToIriView;
}

/**
* Takes a guess for the namespace URI string to use in abbreviation.
* Finds the part of the IRI string before the last '#' or '/'.
*
* @param iriString String string
* @return String or null
*/
protected static String getPossibleKey(String iriString) {
int index = iriString.lastIndexOf('#');
if (index > -1)
return iriString.substring(0, index + 1);
index = iriString.lastIndexOf('/');
if (index > -1)
return iriString.substring(0, index + 1);
// We could add ':' here, it is used as a separator in URNs.
// But it is a multiple use character and always present in the scheme name.
// This is a fast-track guess so don't try guessing based on ':'.
return null;
/**
 * Removes all prefix declarations: empties the forward map, both reverse indexes
 * and the lookup cache, and increments the generation counter.
 */
@Override
public void clear() {
    // Forward map first, then the two reverse indexes.
    prefixToIri.clear();
    iriToPrefixMap.clear();
    iriToPrefixTrie.clear();
    // Finally drop cached lookups and mark the map as modified.
    cache.invalidateAll();
    ++generation;
}

/**
 * Tells whether the map holds no prefix declarations.
 *
 * @return {@code true} if the prefix-to-IRI map is empty
 */
@Override
public boolean isEmpty() {
    return prefixToIri.isEmpty();
}

/**
 * Returns the number of prefix declarations.
 *
 * @return the number of entries in the prefix-to-IRI map
 */
@Override
public int size() {
    return prefixToIri.size();
}

/**
 * Tells whether a prefix is declared in this map.
 *
 * @param prefix the prefix; canonicalised before the lookup, must not be {@code null}
 * @return {@code true} if the prefix-to-IRI map has an entry for the canonical prefix
 * @throws NullPointerException if {@code prefix} is {@code null}
 */
@Override
public boolean containsPrefix(String prefix) {
    String key = PrefixLib.canonicalPrefix(Objects.requireNonNull(prefix));
    return prefixToIri.containsKey(key);
}

/**
 * Ad-hoc timing comparison of {@code PrefixMapStdOrig} and {@code PrefixMapStd}.
 * For five runs, two base IRIs (one '/'-separated, one ':'-separated) and both
 * implementations, it registers 2000 prefixes and then times one million
 * {@code abbreviate} calls with a single fixed IRI and one million calls with
 * varying IRIs, printing the elapsed seconds for each.
 *
 * @param args ignored
 * @throws Exception declared for convenience; nothing in the body is expected to throw
 */
public static void main(String[] args) throws Exception {

    // Each entry is { base IRI, separator appended after the namespace number }.
    String[][] baseIris = {
        {"http://example.org/", "/"},
        {"urn:foo:bar:", ":"}
    };

    for (int runId = 0; runId < 5; ++runId) {
        for (String[] e : baseIris) {
            String baseIriStr = e[0];
            String separator = e[1];

            for (int approachId = 0; approachId < 2; ++approachId) {
                // Select the prefix map implementation:
                // 0 -> original (PrefixMapStdOrig), 1 -> improved (PrefixMapStd)
                PrefixMap pm = approachId == 0 ? new PrefixMapStdOrig() : new PrefixMapStd();

                // Initialize some prefixes
                for (int i = 0; i < 2000; ++i) {
                    pm.add("ns" + i, baseIriStr + i + separator);
                }

                // Lookup with the same IRI on every iteration
                Stopwatch sw = Stopwatch.createStarted();
                String staticIri = baseIriStr + "1" + separator + "foobar";
                for (int i = 0; i < 1000000; ++i) {
                    pm.abbreviate(staticIri);
                }
                System.out.println(String.format("Run %d with base <%s> and separator %s using approach %d: Static IRI lookups took %.3f seconds", runId, baseIriStr, separator, approachId, sw.elapsed(TimeUnit.MILLISECONDS) * 0.001));

                // Lookup with different IRIs; only namespaces 0..1999 were registered above
                Stopwatch sw2 = Stopwatch.createStarted();
                for (int i = 0; i < 1000000; ++i) {
                    String iriStr = baseIriStr + (i % 10000) + separator + "foobar";
                    pm.abbreviate(iriStr);
                }
                System.out.println(String.format("Run %d with base <%s> and separator %s using approach %d: Dynamic IRI lookups took %.3f seconds", runId, baseIriStr, separator, approachId, sw2.elapsed(TimeUnit.MILLISECONDS) * 0.001));

                System.out.println();
            }
        }
    }
}
}
Loading

0 comments on commit b78216e

Please sign in to comment.