Skip to content

Commit

Permalink
Improved the readFreqComprList to now be more logical and usable
Browse files Browse the repository at this point in the history
  • Loading branch information
Flixtastic committed Jan 6, 2025
1 parent 75cf7d1 commit 5b29ca5
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 41 deletions.
43 changes: 22 additions & 21 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,9 +341,6 @@ void IndexImpl::createTextIndex(const string& filename,
const IndexImpl::TextVec& vec) {
ad_utility::File out(filename.c_str(), "w");
currenttOffset_ = 0;
std::vector<std::tuple<TextBlockIndex, TextRecordIndex, WordOrEntityIndex,
Score, bool>>
testVec(vec.begin(), vec.end());
// Detect block boundaries from the main key of the vec.
// Write the data for each block.
// First, there's the classic lists, then the additional entity ones.
Expand Down Expand Up @@ -586,21 +583,22 @@ IdTable IndexImpl::readWordCl(
idTable.resize(cids.size());
ql::ranges::transform(cids, idTable.getColumn(0).begin(),
&Id::makeFromTextRecordIndex);
ql::ranges::transform(
TextIndexReadWrite::readFreqComprList<WordIndex>(
ql::ranges::copy(
TextIndexReadWrite::readFreqComprList<Id, WordIndex>(
tbmd._cl._nofElements, tbmd._cl._startWordlist,
static_cast<size_t>(tbmd._cl._startScorelist -
tbmd._cl._startWordlist),
textIndexFile_),
idTable.getColumn(1).begin(), [](WordIndex id) {
return Id::makeFromWordVocabIndex(WordVocabIndex::make(id));
});
std::ranges::transform(TextIndexReadWrite::readFreqComprList<Score>(
tbmd._cl._nofElements, tbmd._cl._startScorelist,
static_cast<size_t>(tbmd._cl._lastByte + 1 -
tbmd._cl._startScorelist),
textIndexFile_),
idTable.getColumn(2).begin(), &Id::makeFromInt);
textIndexFile_,
[](WordIndex id) {
return Id::makeFromWordVocabIndex(WordVocabIndex::make(id));
}),
idTable.getColumn(1).begin());
std::ranges::copy(TextIndexReadWrite::readFreqComprList<Id, Score>(
tbmd._cl._nofElements, tbmd._cl._startScorelist,
static_cast<size_t>(tbmd._cl._lastByte + 1 -
tbmd._cl._startScorelist),
textIndexFile_, &Id::makeFromInt),
idTable.getColumn(2).begin());
return idTable;
}

Expand All @@ -619,19 +617,22 @@ IdTable IndexImpl::readWordEntityCl(
ql::ranges::transform(cids, idTable.getColumn(0).begin(),
&Id::makeFromTextRecordIndex);
ql::ranges::copy(
TextIndexReadWrite::readFreqComprList<Id>(
TextIndexReadWrite::readFreqComprList<Id, WordIndex>(
tbmd._entityCl._nofElements, tbmd._entityCl._startWordlist,
static_cast<size_t>(tbmd._entityCl._startScorelist -
tbmd._entityCl._startWordlist),
textIndexFile_, &Id::fromBits),
textIndexFile_,
[](uint64_t from) {
return Id::makeFromVocabIndex(VocabIndex::make(from));
}),
idTable.getColumn(1).begin());
ql::ranges::transform(
TextIndexReadWrite::readFreqComprList<Score>(
ql::ranges::copy(
TextIndexReadWrite::readFreqComprList<Id, Score>(
tbmd._entityCl._nofElements, tbmd._entityCl._startScorelist,
static_cast<size_t>(tbmd._entityCl._lastByte + 1 -
tbmd._entityCl._startScorelist),
textIndexFile_),
idTable.getColumn(2).begin(), &Id::makeFromInt);
textIndexFile_, &Id::makeFromInt),
idTable.getColumn(2).begin());
return idTable;
}

Expand Down
41 changes: 21 additions & 20 deletions src/index/TextIndexReadWrite.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,31 @@ void writeVectorAndMoveOffset(const std::vector<T>& vectorToWrite,
size_t nofElements, ad_utility::File& file,
off_t& currentOffset);

// The readFreqCompr and readGapCompr methods have to be in the header file
// because of the constexpr being evaluated at compile time
template <typename T, typename MakeFromUint64t = std::identity>
vector<T> readFreqComprList(size_t nofElements, off_t from, size_t nofBytes,
const ad_utility::File& textIndexFile,
MakeFromUint64t makeFromUint = MakeFromUint64t{}) {
// Read a frequency-compressed list from the `textIndexFile`. `From` is the
// type that was used to create the codebook in the writing step, and `To` is
// the type that the codebook values are converted to. If no `transformer` is
// given, the conversion is done with a `static_cast`.
template <typename To, typename From>
vector<To> readFreqComprList(
size_t nofElements, off_t from, size_t nofBytes,
const ad_utility::File& textIndexFile,
const std::function<To(From)>& transformer = [](From x) {
return static_cast<To>(x);
}) {
AD_CONTRACT_CHECK(nofBytes > 0);
LOG(DEBUG) << "Reading frequency-encoded list from disk...\n";
LOG(TRACE) << "NofElements: " << nofElements << ", from: " << from
<< ", nofBytes: " << nofBytes << '\n';
size_t nofCodebookBytes;
vector<T> result;
vector<uint64_t> frequencyEncodedResult;
uint64_t* encoded = new uint64_t[nofElements];
result.resize(nofElements + 250);
frequencyEncodedResult.resize(nofElements + 250);
off_t current = from;
size_t ret = textIndexFile.read(&nofCodebookBytes, sizeof(size_t), current);
LOG(TRACE) << "Nof Codebook Bytes: " << nofCodebookBytes << '\n';
AD_CONTRACT_CHECK(sizeof(size_t) == ret);
current += ret;
T* codebook = new T[nofCodebookBytes / sizeof(T)];
From* codebook = new From[nofCodebookBytes / sizeof(From)];
ret = textIndexFile.read(codebook, nofCodebookBytes, current);
current += ret;
AD_CONTRACT_CHECK(ret == size_t(nofCodebookBytes));
Expand All @@ -59,18 +64,14 @@ vector<T> readFreqComprList(size_t nofElements, off_t from, size_t nofBytes,
current += ret;
AD_CONTRACT_CHECK(size_t(current - from) == nofBytes);
LOG(DEBUG) << "Decoding Simple8b code...\n";
ad_utility::Simple8bCode::decode(encoded, nofElements, result.data(),
makeFromUint);
ad_utility::Simple8bCode::decode(encoded, nofElements,
frequencyEncodedResult.data());
LOG(DEBUG) << "Reverting frequency encoded items to actual IDs...\n";
result.resize(nofElements);
for (size_t i = 0; i < result.size(); ++i) {
// TODO<joka921> handle the strong ID types properly.
if constexpr (requires(T t) { t.getBits(); }) {
result[i] = Id::makeFromVocabIndex(
VocabIndex::make(codebook[result[i].getBits()].getBits()));
} else {
result[i] = codebook[result[i]];
}
frequencyEncodedResult.resize(nofElements);
vector<To> result;
result.reserve(frequencyEncodedResult.size());
for (size_t i = 0; i < frequencyEncodedResult.size(); ++i) {
result.push_back(transformer(codebook[frequencyEncodedResult[i]]));
}
delete[] encoded;
delete[] codebook;
Expand Down

0 comments on commit 5b29ca5

Please sign in to comment.