Skip to content

Commit

Permalink
Refactor things.
Browse files Browse the repository at this point in the history
TODO:
Make the vocabulary implementation selectable via CMake.

Signed-off-by: Johannes Kalmbach <[email protected]>
  • Loading branch information
joka921 committed Jan 31, 2025
1 parent e9e8dfd commit 79a11b6
Show file tree
Hide file tree
Showing 13 changed files with 85 additions and 87 deletions.
11 changes: 9 additions & 2 deletions src/engine/ExportQueryExecutionTrees.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,15 @@ ExportQueryExecutionTrees::getLiteralOrIriFromVocabIndex(
case Datatype::LocalVocabIndex:
return localVocab.getWord(id.getLocalVocabIndex()).asLiteralOrIri();
case Datatype::VocabIndex: {
auto entity = index.indexToString(id.getVocabIndex());
return LiteralOrIri::fromStringRepresentation(entity);
auto getEntity = [&index, id]() {
return index.indexToString(id.getVocabIndex());
};
// The type of the entity might be `string_view` (if the vocabulary is stored
// uncompressed in RAM) or `string` (if it is on disk, compressed, or both).
// The following code works and is efficient in all cases. In particular, if
// `getEntity()` already returns a `string`, constructing the `std::string`
// from that temporary incurs no additional copy (guaranteed copy elision).
return LiteralOrIri::fromStringRepresentation(std::string(getEntity()));
}
default:
AD_FAIL();
Expand Down
5 changes: 3 additions & 2 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,13 @@ size_t Index::getCardinality(
}

// ____________________________________________________________________________
std::string Index::indexToString(VocabIndex id) const {
auto Index::indexToString(VocabIndex id) const -> Vocab::AccessReturnType {
return pimpl_->indexToString(id);
}

// ____________________________________________________________________________
std::string_view Index::indexToString(WordVocabIndex id) const {
auto Index::indexToString(WordVocabIndex id) const
-> TextVocabulary::AccessReturnType {
return pimpl_->indexToString(id);
}

Expand Down
10 changes: 4 additions & 6 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,11 @@ class Index {
// Read necessary metadata into memory and open file handles.
void addTextFromOnDiskIndex();

using Vocab =
Vocabulary<CompressedString, TripleComponentComparator, VocabIndex>;
using Vocab = RdfsVocabulary;
[[nodiscard]] const Vocab& getVocab() const;
Vocab& getNonConstVocabForTesting();

using TextVocab =
Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
using TextVocab = TextVocabulary;
[[nodiscard]] const TextVocab& getTextVocab() const;

// Get a (non-owning) pointer to the BlankNodeManager of this Index.
Expand All @@ -132,8 +130,8 @@ class Index {

// TODO<joka921> Once we have an overview over the folding this logic should
// probably not be in the index class.
std::string indexToString(VocabIndex id) const;
std::string_view indexToString(WordVocabIndex id) const;
Vocab::AccessReturnType indexToString(VocabIndex id) const;
TextVocab::AccessReturnType indexToString(WordVocabIndex id) const;

[[nodiscard]] Vocab::PrefixRanges prefixRanges(std::string_view prefix) const;

Expand Down
8 changes: 4 additions & 4 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1522,13 +1522,13 @@ size_t IndexImpl::getCardinality(
}

// ___________________________________________________________________________
// TODO<joka921> Make this the return type of the vocabulary.
std::string IndexImpl::indexToString(VocabIndex id) const {
return std::string{vocab_[id]};
RdfsVocabulary::AccessReturnType IndexImpl::indexToString(VocabIndex id) const {
return vocab_[id];
}

// ___________________________________________________________________________
std::string_view IndexImpl::indexToString(WordVocabIndex id) const {
TextVocabulary::AccessReturnType IndexImpl::indexToString(
WordVocabIndex id) const {
return textVocab_[id];
}

Expand Down
4 changes: 2 additions & 2 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,10 +305,10 @@ class IndexImpl {
const LocatedTriplesSnapshot& locatedTriplesSnapshot) const;

// ___________________________________________________________________________
std::string indexToString(VocabIndex id) const;
RdfsVocabulary::AccessReturnType indexToString(VocabIndex id) const;

// ___________________________________________________________________________
std::string_view indexToString(WordVocabIndex id) const;
TextVocabulary::AccessReturnType indexToString(WordVocabIndex id) const;

public:
// ___________________________________________________________________________
Expand Down
1 change: 0 additions & 1 deletion src/index/StringSortComparator.h
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,6 @@ class TripleComponentComparator {
return compare(spA, spB, level) < 0;
}

// TODO<joka921> Unify these three functions.
bool operator()(const SplitVal& spA, std::string_view b,
const Level level) const {
auto spB = extractAndTransformComparable(b, level, false);
Expand Down
42 changes: 12 additions & 30 deletions src/index/Vocabulary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,8 @@ bool Vocabulary<StringType, ComparatorType, IndexT>::PrefixRanges::contain(
// _____________________________________________________________________________
template <class S, class C, typename I>
void Vocabulary<S, C, I>::readFromFile(const string& fileName) {
LOG(INFO) << "Reading vocabulary from file " << fileName << " ..."
<< std::endl;
vocabulary_.close();
vocabulary_.open(fileName);
if constexpr (isCompressed_ && false) {
const auto& internalExternalVocab =
vocabulary_.getUnderlyingVocabulary().getUnderlyingVocabulary();
LOG(INFO) << "Done, number of words: "
<< internalExternalVocab.internalVocab().size() << std::endl;
LOG(INFO) << "Number of words in external vocabulary: "
<< internalExternalVocab.externalVocab().size() << std::endl;
} else {
LOG(INFO) << "Done, number of words: " << vocabulary_.size() << std::endl;
}

// Precomputing ranges for IRIs, blank nodes, and literals, for faster
// processing of the `isIri` and `isLiteral` functions.
Expand Down Expand Up @@ -88,19 +76,12 @@ bool Vocabulary<S, C, I>::stringIsLiteral(std::string_view s) {
// _____________________________________________________________________________
template <class S, class C, class I>
bool Vocabulary<S, C, I>::shouldBeExternalized(string_view s) const {
// TODO<joka921> Completely refactor the Vocabulary on the different
// Types, it is a mess.

// If the string is not compressed, this means that this is a text vocabulary
// and thus doesn't support externalization.
if constexpr (std::is_same_v<S, CompressedString>) {
if (!stringIsLiteral(s)) {
return shouldEntityBeExternalized(s);
} else {
return shouldLiteralBeExternalized(s);
}
// TODO<joka921> We should have a completely separate layer that handles the
// externalization, not the Vocab.
if (!stringIsLiteral(s)) {
return shouldEntityBeExternalized(s);
} else {
return false;
return shouldLiteralBeExternalized(s);
}
}

Expand Down Expand Up @@ -264,17 +245,18 @@ auto Vocabulary<S, C, I>::prefixRanges(std::string_view prefix) const
}

// _____________________________________________________________________________
template <typename S, typename C, typename I>
auto Vocabulary<S, C, I>::operator[](IndexType idx) const
-> AccessReturnType_t<S> {
template <typename UnderlyingVocabulary, typename C, typename I>
auto Vocabulary<UnderlyingVocabulary, C, I>::operator[](IndexType idx) const
-> AccessReturnType {
AD_CONTRACT_CHECK(idx.get() < size());
return vocabulary_[idx.get()];
}

// Explicit template instantiations
template class Vocabulary<CompressedString, TripleComponentComparator,
VocabIndex>;
template class Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
template class Vocabulary<detail::UnderlyingVocabRdfsVocabulary,
TripleComponentComparator, VocabIndex>;
template class Vocabulary<detail::UnderlyingVocabTextVocabulary,
SimpleStringComparator, WordVocabIndex>;

template void RdfsVocabulary::initializeInternalizedLangs<nlohmann::json>(
const nlohmann::json&);
Expand Down
57 changes: 25 additions & 32 deletions src/index/Vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,6 @@
using std::string;
using std::vector;

template <class StringType>
using AccessReturnType_t = std::string_view;
/*
std::conditional_t<std::is_same_v<StringType, CompressedString>,
std::string, std::string_view>;
*/

template <typename IndexT = WordVocabIndex>
class IdRange {
public:
Expand All @@ -69,9 +62,15 @@ inline std::ostream& operator<<(std::ostream& stream,
// retrieval. The first template parameter is the underlying vocabulary
// implementation (for example `VocabularyInMemory`); it determines how the
// words are stored and which type `operator[]` returns.
template <typename StringType, typename ComparatorType, typename IndexT>
template <typename UnderlyingVocabulary, typename ComparatorType,
typename IndexT>
class Vocabulary {
public:
// The type that is returned by the `operator[]` of this vocabulary. Typically
// either `std::string` or `std::string_view`.
using AccessReturnType =
decltype(std::declval<const UnderlyingVocabulary&>()[0]);

// The index ranges for a prefix + a function to check whether a given index
// is contained in one of them.
//
Expand All @@ -96,17 +95,6 @@ class Vocabulary {
// The different type of data that is stored in the vocabulary
enum class Datatypes { Literal, Iri, Float, Date };

template <typename T, typename R = void>
using enable_if_compressed =
std::enable_if_t<std::is_same_v<T, CompressedString>>;

template <typename T, typename R = void>
using enable_if_uncompressed =
std::enable_if_t<!std::is_same_v<T, CompressedString>>;

static constexpr bool isCompressed_ =
std::is_same_v<StringType, CompressedString>;

// If a literal uses one of these language tags or starts with one of these
// prefixes, it will be externalized. By default, everything is externalized.
// Both of these settings can be overridden using the `settings.json` file.
Expand All @@ -116,13 +104,19 @@ class Vocabulary {
vector<std::string> internalizedLangs_;
vector<std::string> externalizedPrefixes_{""};

using UnderlyingVocabulary = VocabularyInMemory;
// using UnderlyingVocabulary = VocabularyInMemory;
/*
using UnderlyingVocabulary =
std::conditional_t<isCompressed_,
CompressedVocabulary<VocabularyInternalExternal>,
VocabularyInMemory>;
*/
/*
using UnderlyingVocabulary =
std::conditional_t<isCompressed_,
CompressedVocabulary<VocabularyInMemory>,
VocabularyInMemory>;
*/
using VocabularyWithUnicodeComparator =
UnicodeVocabulary<UnderlyingVocabulary, ComparatorType>;

Expand All @@ -137,10 +131,7 @@ class Vocabulary {
using SortLevel = typename ComparatorType::Level;
using IndexType = IndexT;

template <
typename = std::enable_if_t<std::is_same_v<StringType, string> ||
std::is_same_v<StringType, CompressedString>>>
Vocabulary() {}
Vocabulary() = default;
Vocabulary& operator=(Vocabulary&&) noexcept = default;
Vocabulary(Vocabulary&&) noexcept = default;

Expand All @@ -151,10 +142,7 @@ class Vocabulary {

// Get the word with the given `idx`. Throw if the `idx` is not contained
// in the vocabulary.
AccessReturnType_t<StringType> operator[](IndexType idx) const;

// AccessReturnType_t<StringType> at(IndexType idx) const { return
// operator[](id); }
AccessReturnType operator[](IndexType idx) const;

//! Get the number of words in the vocabulary.
[[nodiscard]] size_t size() const { return vocabulary_.size(); }
Expand Down Expand Up @@ -247,7 +235,12 @@ class Vocabulary {
}
};

using RdfsVocabulary =
Vocabulary<CompressedString, TripleComponentComparator, VocabIndex>;
using TextVocabulary =
Vocabulary<std::string, SimpleStringComparator, WordVocabIndex>;
namespace detail {
using UnderlyingVocabRdfsVocabulary = VocabularyInMemory;
using UnderlyingVocabTextVocabulary = VocabularyInMemory;
} // namespace detail

using RdfsVocabulary = Vocabulary<detail::UnderlyingVocabRdfsVocabulary,
TripleComponentComparator, VocabIndex>;
using TextVocabulary = Vocabulary<detail::UnderlyingVocabTextVocabulary,
SimpleStringComparator, WordVocabIndex>;
3 changes: 3 additions & 0 deletions src/index/vocabulary/VocabularyInMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ using std::string;

// _____________________________________________________________________________
void VocabularyInMemory::open(const string& fileName) {
LOG(INFO) << "Reading vocabulary from file " << fileName << " ..."
<< std::endl;
_words.clear();
ad_utility::serialization::FileReadSerializer file(fileName);
file >> _words;
LOG(INFO) << "Done, number of words: " << size() << std::endl;
}

// _____________________________________________________________________________
Expand Down
6 changes: 6 additions & 0 deletions src/index/vocabulary/VocabularyInMemory.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,18 @@ class VocabularyInMemory
// Helper that writes words one at a time through the underlying
// `Words::Writer`, which is constructed from the `filename` passed to the
// constructor.
struct WordWriter {
// The writer to which all calls are forwarded.
typename Words::Writer writer_;
// Construct the underlying writer from `filename`.
explicit WordWriter(const std::string& filename) : writer_{filename} {}

// Write a word by pushing the characters of `str` (pointer + size) to the
// underlying writer. The `isExternalDummy` argument is ignored; it is only
// there to have a consistent interface with the `VocabularyInternalExternal`.
void operator()(std::string_view str,
[[maybe_unused]] bool isExternalDummy = false) {
writer_.push(str.data(), str.size());
}

// Forward to `finish()` of the underlying writer.
void finish() { writer_.finish(); }

// The `readableName()` function is only there to have a consistent
// interface with the `VocabularyInternalExternal`. It returns a mutable
// reference to `readableNameDummy_`, which is not used anywhere else in this
// struct.
std::string readableNameDummy_;
std::string& readableName() { return readableNameDummy_; }
};
Expand Down
12 changes: 12 additions & 0 deletions src/index/vocabulary/VocabularyInternalExternal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ void VocabularyInternalExternal::WordWriter::finish() {
internalWriter_.finish();
externalWriter_.finish();
}

// _____________________________________________________________________________
void VocabularyInternalExternal::open(const string& filename) {
  LOG(INFO) << "Reading vocabulary from file " << filename
            << " ..." << std::endl;

  // The two parts are read from sibling files that share `filename` as their
  // common prefix and differ only in the suffix.
  internalVocab_.open(filename + ".internal");
  externalVocab_.open(filename + ".external");

  // Report the total number of words first, then how many of them are held by
  // the internal vocabulary.
  const auto numWordsTotal = size();
  LOG(INFO) << "Done, number of words: " << numWordsTotal << std::endl;

  const auto numWordsInternal = internalVocab_.size();
  LOG(INFO) << "Number of words in internal vocabulary (these are also part of "
               "the external vocabulary): "
            << numWordsInternal << std::endl;
}
5 changes: 1 addition & 4 deletions src/index/vocabulary/VocabularyInternalExternal.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,7 @@ class VocabularyInternalExternal {

// Read the vocabulary from a file. The file must have been created using a
// `WordWriter`.
void open(const string& filename) {
internalVocab_.open(filename + ".internal");
externalVocab_.open(filename + ".external");
}
void open(const string& filename);

// Return the total number of words
[[nodiscard]] size_t size() const { return externalVocab_.size(); }
Expand Down
8 changes: 4 additions & 4 deletions test/engine/TextIndexScanTestHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ inline string getTextRecordFromResultTable(const QueryExecutionContext* qec,
result.idTable().getColumn(0)[rowIndex].getTextRecordIndex().get();
if (nofNonLiterals <= textRecordIdFromTable) {
// Return when from Literals
return qec->getIndex().indexToString(
VocabIndex::make(textRecordIdFromTable - nofNonLiterals));
return std::string{qec->getIndex().indexToString(
VocabIndex::make(textRecordIdFromTable - nofNonLiterals))};
} else {
// Return when from DocsDB
return qec->getIndex().getTextExcerpt(
Expand All @@ -41,8 +41,8 @@ inline const TextRecordIndex getTextRecordIdFromResultTable(
inline string getEntityFromResultTable(const QueryExecutionContext* qec,
const ProtoResult& result,
const size_t& rowIndex) {
return qec->getIndex().indexToString(
result.idTable().getColumn(1)[rowIndex].getVocabIndex());
return std::string{qec->getIndex().indexToString(
result.idTable().getColumn(1)[rowIndex].getVocabIndex())};
}

// Only use on prefix search results
Expand Down

0 comments on commit 79a11b6

Please sign in to comment.