diff --git a/benchmark/src/benchmarks/benchmark.stringpool.cpp b/benchmark/src/benchmarks/benchmark.stringpool.cpp index 2d82adc..73106fa 100644 --- a/benchmark/src/benchmarks/benchmark.stringpool.cpp +++ b/benchmark/src/benchmarks/benchmark.stringpool.cpp @@ -1,73 +1,136 @@ #include +#include #include namespace zasm::benchmarks { - static const std::string TestStrings[] = { "hello", "world", "longer string", "even longer string", - "even longer longer string" }; + static constexpr auto kTestSize = 500'000; + static constexpr const char kChars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - static void BM_StringPool_Aquire(benchmark::State& state) + static const std::vector kInputStrings = []() { + std::vector strings; + + std::mt19937 prng(42); + for (int i = 0; i < kTestSize; ++i) + { + std::string str; + for (size_t i = 0; i < 4 + (prng() % 24); ++i) + { + str.push_back(kChars[prng() % (sizeof(kChars) - 1)]); + } + strings.push_back(std::move(str)); + } + return strings; + }(); + + static void BM_StringPool_Acquire(benchmark::State& state) { StringPool pool; for (auto _ : state) { - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - benchmark::DoNotOptimize(stringId); + auto stringId = pool.acquire(str); + benchmark::DoNotOptimize(stringId); + } } } - BENCHMARK(BM_StringPool_Aquire)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_Acquire)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_Release(benchmark::State& state) { StringPool pool; for (auto _ : state) { + for (auto i = 0; i < state.range(0); ++i) + { + state.PauseTiming(); + const auto& str = kInputStrings[i]; + + auto stringId = pool.acquire(str); + state.ResumeTiming(); + + auto refCount = pool.release(stringId); + benchmark::DoNotOptimize(refCount); + } + } + } + BENCHMARK(BM_StringPool_Release)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); + + static void BM_StringPool_Reuse(benchmark::State& state) + { + StringPool pool; + for (auto _ : state) + { + std::vector stringIds; + state.PauseTiming(); - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + const auto& str = kInputStrings[i]; + + auto stringId = pool.acquire(str); + stringIds.push_back(stringId); + } - auto stringId = pool.aquire(str); + // Clear. + for (auto i = 0; i < state.range(0); ++i) + { + auto stringId = pool.release(stringIds[i]); + } state.ResumeTiming(); - auto refCount = pool.release(stringId); - benchmark::DoNotOptimize(refCount); + for (auto i = 0; i < state.range(0); ++i) + { + const auto& str = kInputStrings[i]; + + auto stringId = pool.acquire(str); + benchmark::DoNotOptimize(stringId); + } } } - BENCHMARK(BM_StringPool_Release)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_Reuse)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_Get(benchmark::State& state) { StringPool pool; for (auto _ : state) { - state.PauseTiming(); - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + state.PauseTiming(); + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - state.ResumeTiming(); + auto stringId = pool.acquire(str); + state.ResumeTiming(); - const char* res = pool.get(stringId); - benchmark::DoNotOptimize(res); + const char* res = pool.get(stringId); + benchmark::DoNotOptimize(res); + } } } - BENCHMARK(BM_StringPool_Get)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_Get)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); static void BM_StringPool_GetLength(benchmark::State& state) { StringPool pool; for (auto _ : state) { - state.PauseTiming(); - const auto& str = TestStrings[state.range(0)]; + for (auto i = 0; i < state.range(0); ++i) + { + state.PauseTiming(); + const auto& str = kInputStrings[i]; - auto stringId = pool.aquire(str); - state.ResumeTiming(); + auto stringId = pool.acquire(str); + state.ResumeTiming(); - auto strLen = pool.getLength(stringId); - benchmark::DoNotOptimize(strLen); + auto strLen = pool.getLength(stringId); + benchmark::DoNotOptimize(strLen); + } } } - BENCHMARK(BM_StringPool_GetLength)->DenseRange(0, std::size(TestStrings) - 1); + BENCHMARK(BM_StringPool_GetLength)->Range(0, kTestSize)->Unit(benchmark::kMillisecond); } // namespace zasm::benchmarks diff --git a/tests/src/tests/tests.stringpool.cpp b/tests/src/tests/tests.stringpool.cpp index 5630da1..2a9d738 100644 --- a/tests/src/tests/tests.stringpool.cpp +++ b/tests/src/tests/tests.stringpool.cpp @@ -1,4 +1,7 @@ #include +#include + +#define IN_TESTS #include namespace zasm::tests @@ -10,7 +13,7 @@ namespace zasm::tests constexpr const char str1[] = "hello"; constexpr const char str2[] = "Hello"; - const auto id0 = pool.aquire(str1); + const auto id0 = pool.acquire(str1); ASSERT_NE(id0, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1); ASSERT_EQ(pool.getRefCount(id0), 1); @@ -18,7 +21,7 @@ namespace zasm::tests ASSERT_NE(cstr0, nullptr); ASSERT_EQ(strcmp(cstr0, str1), 0); - const auto id1 = pool.aquire(str1); + const auto id1 = pool.acquire(str1); ASSERT_NE(id1, StringPool::Id::Invalid); ASSERT_EQ(id1, id0); ASSERT_EQ(pool.getLength(id1), std::size(str1) - 1); @@ -27,7 +30,7 @@ namespace zasm::tests ASSERT_EQ(cstr1, cstr0); ASSERT_EQ(strcmp(cstr1, str1), 0); - const auto id2 = pool.aquire(str2); + const auto id2 = pool.acquire(str2); ASSERT_NE(id2, StringPool::Id::Invalid); ASSERT_NE(id2, id1); ASSERT_EQ(pool.getLength(id2), std::size(str2) - 1); @@ -36,6 +39,48 @@ namespace zasm::tests ASSERT_EQ(strcmp(cstr2, str2), 0); } + TEST(StringPoolTests, TestDuplicate) + { + StringPool pool; + + constexpr const char str1[] = "hello"; + constexpr const char str2[] = "hello"; + constexpr const char str3[] = "hello"; + constexpr const char str4[] = "hello1"; + + const auto id0 = pool.acquire(str1); + ASSERT_NE(id0, StringPool::Id::Invalid); + ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1); + ASSERT_EQ(pool.getRefCount(id0), 1); + const auto* cstr0 = pool.get(id0); + ASSERT_NE(cstr0, nullptr); + ASSERT_EQ(strcmp(cstr0, str1), 0); + + const auto id1 = pool.acquire(str2); + ASSERT_NE(id1, StringPool::Id::Invalid); + ASSERT_EQ(id1, id0); + ASSERT_EQ(pool.getLength(id1), std::size(str2) - 1); + ASSERT_EQ(pool.getRefCount(id1), 2); + const auto* cstr2 = pool.get(id1); + ASSERT_EQ(strcmp(cstr2, str2), 0); + + const auto id2 = pool.acquire(str3); + ASSERT_NE(id2, StringPool::Id::Invalid); + ASSERT_EQ(id2, id0); + ASSERT_EQ(pool.getLength(id2), std::size(str3) - 1); + ASSERT_EQ(pool.getRefCount(id2), 3); + const auto* cstr3 = pool.get(id2); + ASSERT_EQ(strcmp(cstr3, str3), 0); + + const auto id3 = pool.acquire(str4); + ASSERT_NE(id3, StringPool::Id::Invalid); + ASSERT_NE(id3, id0); + ASSERT_EQ(pool.getLength(id3), std::size(str4) - 1); + ASSERT_EQ(pool.getRefCount(id3), 1); + const auto* cstr4 = pool.get(id3); + ASSERT_EQ(strcmp(cstr4, str4), 0); + } + TEST(StringPoolTests, TestRelease) { StringPool pool; @@ -46,17 +91,17 @@ namespace zasm::tests constexpr const char str4[] = "hello4"; constexpr const char str5[] = "hello..."; - const auto id0 = pool.aquire(str1); + const auto id0 = pool.acquire(str1); ASSERT_NE(id0, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id0), std::size(str1) - 1); ASSERT_EQ(pool.getRefCount(id0), 1); - const auto id1 = pool.aquire(str2); + const auto id1 = pool.acquire(str2); ASSERT_NE(id1, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id1), std::size(str2) - 1); ASSERT_EQ(pool.getRefCount(id1), 1); - const auto id2 = pool.aquire(str3); + const auto id2 = pool.acquire(str3); ASSERT_NE(id2, StringPool::Id::Invalid); ASSERT_EQ(pool.getLength(id2), std::size(str3) - 1); ASSERT_EQ(pool.getRefCount(id2), 1); @@ -65,13 +110,13 @@ namespace zasm::tests const auto* cstr0 = pool.get(id1); ASSERT_EQ(cstr0, nullptr); - const auto id3 = pool.aquire(str4); + const auto id3 = pool.acquire(str4); ASSERT_NE(id3, StringPool::Id::Invalid); ASSERT_EQ(id1, id3); ASSERT_EQ(pool.getLength(id3), std::size(str4) - 1); ASSERT_EQ(pool.getRefCount(id3), 1); - const auto id4 = pool.aquire(str4); + const auto id4 = pool.acquire(str4); ASSERT_NE(id4, StringPool::Id::Invalid); ASSERT_EQ(id4, id3); ASSERT_EQ(pool.getLength(id4), std::size(str4) - 1); @@ -80,11 +125,146 @@ namespace zasm::tests ASSERT_EQ(pool.release(id4), 1); ASSERT_EQ(pool.release(id4), 0); - const auto id5 = pool.aquire(str5); + const auto id5 = pool.acquire(str5); ASSERT_NE(id5, StringPool::Id::Invalid); - ASSERT_NE(id5, id1); + ASSERT_EQ(id5, id1); ASSERT_EQ(pool.getLength(id5), std::size(str5) - 1); ASSERT_EQ(pool.getRefCount(id5), 1); } + constexpr auto kTestSize = 100'000; + + static constexpr const char kChars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + + static const std::vector kInputStrings = []() { + std::vector strings; + std::set uniqueStrings; + std::mt19937 prng(42); + while (uniqueStrings.size() < kTestSize) + { + std::string str; + for (size_t i = 0; i < 4 + (prng() % 24); ++i) + { + str.push_back(kChars[prng() % (sizeof(kChars) - 1)]); + } + uniqueStrings.emplace(std::move(str)); + } + strings.insert(strings.end(), uniqueStrings.begin(), uniqueStrings.end()); + return strings; + }(); + + const auto fillPool = [](StringPool& pool, std::vector& ids) { + ids.clear(); + for (const auto& str : kInputStrings) + { + const auto id = pool.acquire(str); + ASSERT_NE(id, StringPool::Id::Invalid); + ASSERT_EQ(pool.getLength(id), str.size()); + ASSERT_EQ(pool.getRefCount(id), 1); + + const auto* cstr = pool.get(id); + ASSERT_NE(cstr, nullptr); + ASSERT_EQ(strcmp(cstr, str.c_str()), 0); + + ids.push_back(id); + } + ASSERT_EQ(ids.size(), kInputStrings.size()); + }; + + TEST(StringPoolTests, TestManyStrings) + { + StringPool pool; + std::vector ids; + + fillPool(pool, ids); + + // Validate that the contents still match. + for (size_t i = 0; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr); + ASSERT_EQ(strcmp(cstr, str.c_str()), 0); + } + } + + TEST(StringPoolTests, TestClearIndividually) + { + StringPool pool; + std::vector ids; + + fillPool(pool, ids); + + // Clear. + for (auto id : ids) + { + ASSERT_EQ(pool.release(id), 0); + } + + ASSERT_EQ(pool.size(), 0); + } + + TEST(StringPoolTests, TestReuseSingle) + { + StringPool pool; + std::vector ids; + + fillPool(pool, ids); + + // Validate strings. + for (auto i = 0U; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr) << "i = " << i; + ASSERT_EQ(strcmp(cstr, str.c_str()), 0) << "i = " << i; + } + + // Clear. + for (auto id : ids) + { + ASSERT_EQ(pool.release(id), 0); + } + + fillPool(pool, ids); + + // Validate strings. + for (auto i = 0U; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr) << "i = " << i; + ASSERT_EQ(strcmp(cstr, str.c_str()), 0) << "i = " << i; + } + } + + TEST(StringPoolTests, TestReuseMultiple) + { + StringPool pool; + std::vector ids; + + for (size_t i = 0; i < 5; i++) + { + fillPool(pool, ids); + + // Validate that the contents still match. + for (auto i = 0U; i < kTestSize; i++) + { + const auto& str = kInputStrings[i]; + + const auto* cstr = pool.get(ids[i]); + ASSERT_NE(cstr, nullptr); + ASSERT_EQ(strcmp(cstr, str.c_str()), 0); + } + + // Clear. + for (auto id : ids) + { + ASSERT_EQ(pool.release(id), 0); + } + } + } + } // namespace zasm::tests diff --git a/zasm/include/zasm/core/stringpool.hpp b/zasm/include/zasm/core/stringpool.hpp index bd413cb..849f6f4 100644 --- a/zasm/include/zasm/core/stringpool.hpp +++ b/zasm/include/zasm/core/stringpool.hpp @@ -1,9 +1,10 @@ #pragma once #include +#include #include #include -#include +#include #include #include #include @@ -16,37 +17,79 @@ namespace zasm class StringPool { + public: + enum class Id : std::int32_t + { + Invalid = -1, + }; + + static constexpr std::size_t kMaxStringSize = 0xFFFF; + static constexpr std::size_t kAverageStringSize = 24; + static constexpr std::size_t kStringsPerBlock = 4096; + static constexpr std::size_t kBlockSize = kAverageStringSize * kStringsPerBlock; + static constexpr std::size_t kMinStringCapacity = 16; + static constexpr std::size_t kUnspecifiedSize = ~std::size_t{ 0 }; + + static_assert(kBlockSize >= kMaxStringSize, "Block size must be bigger than max string size."); + + // 16 bit prime number. + static constexpr std::size_t kMaxHashBuckets = 39119; + + private: + using StringSize = std::conditional_t< + kMaxStringSize <= std::numeric_limits::max(), std::uint16_t, std::uint32_t>; + + using BlockOffset = std::conditional_t< + kBlockSize <= std::numeric_limits::max(), std::uint32_t, std::uint64_t>; + + using BlockIndex = std::uint32_t; + struct Entry { - std::size_t hash{}; - std::int32_t offset{}; - std::int32_t len{}; - std::int32_t capacity{}; + std::uint64_t hash{}; + BlockIndex blockIndex{}; + BlockOffset offsetInBlock{}; + StringSize len{}; + StringSize capacity{}; std::int32_t refCount{}; + Id nextFreeId{ Id::Invalid }; }; std::vector _entries; - std::vector _data; + std::vector> _hashBuckets; + Id _nextFreeId{ Id::Invalid }; + std::size_t _numFree{}; - public: - enum class Id : std::int32_t + struct Block { - Invalid = -1, + BlockIndex index{}; + std::uint32_t used{}; + std::array data{}; }; + std::vector> _blocks; + public: - // NOLINTNEXTLINE - template Id aquire(const char (&value)[N]) + StringPool() + { + _hashBuckets.resize(kMaxHashBuckets); + } + + Id acquire(const char* value, std::size_t size = kUnspecifiedSize) { - return aquire_(value, N); + if (size == kUnspecifiedSize) + { + size = std::strlen(value); + } + return aquire_(value, size); } - Id aquire(const char* value) + Id acquire(std::string_view str) { - return aquire_(value, strlen(value)); + return aquire_(str.data(), str.size()); } - Id aquire(const std::string& val) + Id acquire(const std::string& val) { return aquire_(val.c_str(), val.size()); } @@ -60,14 +103,28 @@ namespace zasm } const auto newRefCount = --entry->refCount; + assert(newRefCount >= 0); + + if (newRefCount == 0) + { + entry->nextFreeId = _nextFreeId; + _nextFreeId = stringId; + _numFree++; + + const auto bucketIndex = entry->hash % kMaxHashBuckets; + auto& bucket = _hashBuckets[bucketIndex]; + + bucket.erase(std::remove(bucket.begin(), bucket.end(), stringId), bucket.end()); + } + return newRefCount; } - // NOLINTNEXTLINE - template Id find(const char (&value)[N]) const noexcept + Id find(const char* str) const noexcept { - const auto hash = getHash(value, N); - return find(value, N, hash); + const auto len = std::strlen(str); + const auto hash = getHash(str, len); + return find_(str, len, hash); } bool isValid(Id stringId) const noexcept @@ -84,10 +141,10 @@ namespace zasm return nullptr; } - return _data.data() + entry->offset; + return _blocks[entry->blockIndex]->data.data() + entry->offsetInBlock; } - int32_t getLength(Id stringId) const noexcept + std::size_t getLength(Id stringId) const noexcept { const auto* entry = getEntry(*this, stringId); if (entry == nullptr) @@ -110,22 +167,33 @@ namespace zasm void clear() noexcept { + if (_entries.empty()) + { + // Because clearing the buckets is expensive do nothing if already empty. + return; + } _entries.clear(); - _data.clear(); - } - - const char* data() const noexcept - { - return _data.data(); + if (_blocks.size() > 1) + { + _blocks.resize(1); + _blocks[0]->used = 0; + } + _nextFreeId = Id::Invalid; + _numFree = 0; + for (auto& bucket : _hashBuckets) + { + bucket.clear(); + } } std::size_t size() const noexcept { - return _entries.size(); + return _entries.size() - _numFree; } Error save(IStream& stream) const { + // Serialize entries. const auto entryCount = static_cast(_entries.size()); if (auto len = stream.write(&entryCount, sizeof(entryCount)); len == 0) { @@ -138,7 +206,11 @@ namespace zasm { return ErrorCode::InvalidParameter; } - if (auto len = stream.write(entry.offset); len == 0) + if (auto len = stream.write(entry.blockIndex); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(entry.offsetInBlock); len == 0) { return ErrorCode::InvalidParameter; } @@ -154,17 +226,54 @@ namespace zasm { return ErrorCode::InvalidParameter; } + if (auto len = stream.write(entry.nextFreeId); len == 0) + { + return ErrorCode::InvalidParameter; + } } - const auto dataSize = static_cast(_data.size()); - if (auto len = stream.write(dataSize); len == 0) + // Serialize free entries. + if (auto len = stream.write(&_nextFreeId, sizeof(_nextFreeId)); len == 0) { return ErrorCode::InvalidParameter; } - if (dataSize > 0) + // Serialize hash buckets. + for (const auto& bucket : _hashBuckets) { - if (auto len = stream.write(_data.data(), dataSize); len == 0) + const auto bucketSize = static_cast(bucket.size()); + if (auto len = stream.write(&bucketSize, sizeof(bucketSize)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + for (const auto& id : bucket) + { + if (auto len = stream.write(id); len == 0) + { + return ErrorCode::InvalidParameter; + } + } + } + + // Serialize blocks. + const auto blockCount = static_cast(_blocks.size()); + if (auto len = stream.write(&blockCount, sizeof(blockCount)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + for (const auto& block : _blocks) + { + if (auto len = stream.write(block->index); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(block->used); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.write(block->data.data(), block->used); len == 0) { return ErrorCode::InvalidParameter; } @@ -175,8 +284,7 @@ namespace zasm Error load(IStream& stream) { - clear(); - + // Deserialize entries. std::uint32_t entryCount{}; if (auto len = stream.read(&entryCount, sizeof(entryCount)); len == 0) { @@ -191,7 +299,11 @@ namespace zasm { return ErrorCode::InvalidParameter; } - if (auto len = stream.read(entry.offset); len == 0) + if (auto len = stream.read(entry.blockIndex); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(entry.offsetInBlock); len == 0) { return ErrorCode::InvalidParameter; } @@ -207,26 +319,80 @@ namespace zasm { return ErrorCode::InvalidParameter; } + if (auto len = stream.read(entry.nextFreeId); len == 0) + { + return ErrorCode::InvalidParameter; + } + } + + // Deserialize free entries. + Id nextFreeEntry{ Id::Invalid }; + if (auto len = stream.read(&nextFreeEntry, sizeof(nextFreeEntry)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + // Deserialize hash buckets. + auto hashBuckets = std::vector>(); + hashBuckets.resize(kMaxHashBuckets); + for (auto& bucket : hashBuckets) + { + std::uint32_t bucketSize{}; + if (auto len = stream.read(&bucketSize, sizeof(bucketSize)); len == 0) + { + return ErrorCode::InvalidParameter; + } + + bucket.resize(bucketSize); + for (auto& id : bucket) + { + if (auto len = stream.read(id); len == 0) + { + return ErrorCode::InvalidParameter; + } + } } - std::uint32_t dataSize{}; - if (auto len = stream.read(&dataSize, sizeof(dataSize)); len == 0) + // Deserialize blocks. + std::uint32_t blockCount{}; + if (auto len = stream.read(&blockCount, sizeof(blockCount)); len == 0) { return ErrorCode::InvalidParameter; } - std::vector loadedData; - if (dataSize > 0) + auto blocks = std::vector>(); + blocks.resize(blockCount); + for (auto& block : blocks) { - loadedData.resize(dataSize); - if (auto len = stream.read(loadedData.data(), dataSize); len == 0) + block = std::make_unique(); + + if (auto len = stream.read(block->index); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(block->used); len == 0) + { + return ErrorCode::InvalidParameter; + } + if (auto len = stream.read(block->data.data(), block->used); len == 0) { return ErrorCode::InvalidParameter; } } + // Swap state. _entries = std::move(loadedEntries); - _data = std::move(loadedData); + _hashBuckets = std::move(hashBuckets); + _blocks = std::move(blocks); + _nextFreeId = nextFreeEntry; + + // Compute free count. + _numFree = 0; + for (auto freeId = _nextFreeId; freeId != Id::Invalid; + freeId = _entries[static_cast(freeId)].nextFreeId) + { + _numFree++; + } return ErrorCode::None; } @@ -241,7 +407,13 @@ namespace zasm { return nullptr; } - if (self._entries[idx].refCount <= 0) + + const auto refCount = self._entries[idx].refCount; +#ifndef IN_TESTS + assert(refCount > 0); +#endif + + if (refCount <= 0) { return nullptr; } @@ -249,31 +421,118 @@ namespace zasm return &self._entries[idx]; } - Id find_(const char* buf, std::size_t len, std::size_t hash) const noexcept + Id find_(const char* buf, std::size_t len, std::uint64_t hash) const noexcept { - auto it = std::find_if(std::begin(_entries), std::end(_entries), [&](const auto& entry) noexcept { - if (entry.refCount == 0 || entry.hash != hash || entry.len != len) + const auto bucketIndex = hash % kMaxHashBuckets; + const auto& bucket = _hashBuckets[bucketIndex]; + + // Find the lowest hash. + auto itLowest = std::lower_bound(bucket.begin(), bucket.end(), Id::Invalid, [&](Id id, Id) { + return _entries[static_cast(id)].hash < hash; + }); + + // Iterate all matching hashes and compare the string. + for (auto it = itLowest; it != bucket.end(); ++it) + { + const auto id = *it; + const auto& entry = _entries[static_cast(id)]; + if (entry.hash != hash) { - return false; + break; } - const char* str = _data.data() + entry.offset; - return memcmp(buf, str, len) == 0; - }); - if (it != std::end(_entries)) + + if (entry.len != len) + { + continue; + } + + const auto* str = _blocks[entry.blockIndex]->data.data() + entry.offsetInBlock; + if (std::memcmp(str, buf, len) == 0) + { + return id; + } + } + + return Id::Invalid; + } + + Block& getBlock(std::size_t len) + { + // If there are no blocks create a new one. + if (_blocks.empty()) + { + _blocks.emplace_back(std::make_unique()); + return *_blocks.back(); + } + + // See if the last block has enough space. + auto& lastBlock = *_blocks.back(); + if (lastBlock.used + len < kBlockSize) { - const auto index = std::distance(_entries.begin(), it); - return static_cast(index); + return lastBlock; } + + // Create a new block. + auto newBlock = std::make_unique(); + newBlock->index = static_cast(_blocks.size()); + _blocks.emplace_back(std::move(newBlock)); + + return *_blocks.back(); + } + + Id getFreeEntry(std::size_t requiredLength) + { + if (_nextFreeId == Id::Invalid) + { + return Id::Invalid; + } + + auto nextFreeId = _nextFreeId; + auto parentId = Id::Invalid; + + while (nextFreeId != Id::Invalid) + { + const auto& entry = _entries[static_cast(nextFreeId)]; + if (entry.capacity >= requiredLength) + { + if (parentId != Id::Invalid) + { + // Update the next free id of the parent. + auto& parentEntry = _entries[static_cast(parentId)]; + parentEntry.nextFreeId = entry.nextFreeId; + } + else + { + // Update the head of the free list. + _nextFreeId = entry.nextFreeId; + } + _numFree--; + return nextFreeId; + } + parentId = nextFreeId; + nextFreeId = entry.nextFreeId; + } + return Id::Invalid; } - Id aquire_(const char* buf, std::size_t len) + Id aquire_(const char* inputStr, std::size_t len) { - constexpr std::int32_t kTerminatorLength = 1; + const auto hash = getHash(inputStr, len); + return aquire_(inputStr, len, hash); + } - const auto hash = getHash(buf, len); + Id aquire_(const char* inputStr, std::size_t len, std::uint64_t hash) + { + // Strings can not be larger than kMaxStringSize. + if (len >= kMaxStringSize) + { + assert(len < kMaxStringSize); + return Id::Invalid; + } - auto stringId = find_(buf, len, hash); + // Find existing string and increase ref count. + auto stringId = find_(inputStr, len, hash); if (stringId != Id::Invalid) { auto& entry = _entries[static_cast(stringId)]; @@ -281,65 +540,93 @@ namespace zasm return stringId; } - const auto len2 = static_cast(len); + const auto actualLength = static_cast(len); + const auto requiredLength = static_cast(len) + 1; - // Use empty entry if any exist. - auto itEntry = std::find_if(_entries.begin(), _entries.end(), [&](auto&& entry) { - return entry.refCount <= 0 && entry.capacity >= len2 + kTerminatorLength; - }); + const auto insertToHashBucket = [&](Entry& entry, Id id) { + const auto bucketIndex = entry.hash % kMaxHashBuckets; + auto& bucket = _hashBuckets[bucketIndex]; - if (itEntry != _entries.end()) - { - // Found empty space. - auto& entry = *itEntry; + auto sortedIt = std::lower_bound(bucket.begin(), bucket.end(), id, [&](Id id, Id id2) { + return _entries[static_cast(id)].hash < _entries[static_cast(id2)].hash; + }); - stringId = static_cast(std::distance(_entries.begin(), itEntry)); - std::memcpy(_data.data() + entry.offset, buf, len2); + bucket.insert(sortedIt, id); + }; + const auto writeStringToBlock = [&](std::size_t blockOffset, Block& block) { + std::memcpy(block.data.data() + blockOffset, inputStr, actualLength); // Ensure null termination. - const auto termOffset = static_cast(entry.offset) + len2; - _data[termOffset] = '\0'; + block.data[blockOffset + actualLength] = '\0'; + }; + + // Use empty entry if any exist. + stringId = getFreeEntry(requiredLength); + if (stringId != Id::Invalid) + { + // Found empty space. + auto& entry = _entries[static_cast(stringId)]; + assert(entry.refCount == 0); + + auto& block = _blocks[entry.blockIndex]; + writeStringToBlock(entry.offsetInBlock, *block); entry.hash = hash; - entry.len = len2; + entry.len = actualLength; entry.refCount = 1; + entry.nextFreeId = Id::Invalid; + + insertToHashBucket(entry, stringId); return stringId; } // New entry. - const auto offset = static_cast(_data.size()); - std::copy_n(buf, len, std::back_insert_iterator(_data)); - // Ensure null termination. - _data.push_back('\0'); + // We align the capacity to 8 bytes. + const auto capacity = std::max(kMinStringCapacity, (requiredLength + 7) & ~7); + + auto& block = getBlock(capacity); + const auto offset = static_cast(block.used); + + writeStringToBlock(offset, block); + block.used += capacity; stringId = static_cast(_entries.size()); - _entries.push_back({ hash, offset, len2, len2 + kTerminatorLength, 1 }); + auto& entry = _entries.emplace_back(); + entry.hash = hash; + entry.blockIndex = block.index; + entry.offsetInBlock = offset; + entry.len = actualLength; + entry.capacity = capacity; + entry.refCount = 1; + + insertToHashBucket(entry, stringId); return stringId; } - static constexpr std::size_t getHash(const char* buf, size_t len) noexcept + static constexpr std::uint64_t getHash(const char* buf, size_t len) noexcept { - if (buf == nullptr) + assert(buf != nullptr); + assert(len > 0); + + if (buf == nullptr || len == 0) { return 0; } -#ifdef _M_AMD64 - constexpr std::size_t offset = 0xcbf29ce484222325ULL; - constexpr std::size_t prime = 0x00000100000001B3ULL; -#else - constexpr std::size_t offset = 0x811c9dc5U; - constexpr std::size_t prime = 0x01000193U; -#endif - std::size_t result = offset; + + constexpr std::uint64_t offset = 0xcbf29ce484222325ULL; + constexpr std::uint64_t prime = 0x00000100000001B3ULL; + + std::uint64_t result = offset; for (std::size_t i = 0; i < len; ++i) { result ^= static_cast(buf[i]); result *= prime; } + return result; } }; diff --git a/zasm/src/zasm/src/program/program.cpp b/zasm/src/zasm/src/program/program.cpp index ddcb651..ce5685c 100644 --- a/zasm/src/zasm/src/program/program.cpp +++ b/zasm/src/zasm/src/program/program.cpp @@ -154,7 +154,7 @@ namespace zasm if (name != nullptr) { - entry.nameId = _state->symbolNames.aquire(name); + entry.nameId = _state->symbolNames.acquire(name); } } @@ -594,7 +594,7 @@ namespace zasm { return StringPool::Id::Invalid; } - return state.symbolNames.aquire(str); + return state.symbolNames.acquire(str); } static Label createLabel_(detail::ProgramState& state, StringPool::Id nameId, StringPool::Id modId, LabelFlags flags) @@ -731,7 +731,7 @@ namespace zasm if (name != nullptr) { - entry.nameId = _state->symbolNames.aquire(name); + entry.nameId = _state->symbolNames.acquire(name); } return Section{ sectId }; @@ -800,7 +800,7 @@ namespace zasm entry->nameId = StringPool::Id::Invalid; } - entry->nameId = _state->symbolNames.aquire(name); + entry->nameId = _state->symbolNames.acquire(name); return ErrorCode::None; } diff --git a/zasm/src/zasm/src/serialization/serializer.cpp b/zasm/src/zasm/src/serialization/serializer.cpp index 05e57aa..548ceb0 100644 --- a/zasm/src/zasm/src/serialization/serializer.cpp +++ b/zasm/src/zasm/src/serialization/serializer.cpp @@ -475,7 +475,7 @@ namespace zasm defaultSect.attribs = Section::kDefaultAttribs; defaultSect.align = Section::kDefaultAlign; defaultSect.address = newBase; - defaultSect.nameId = programState.symbolNames.aquire(".text"); + defaultSect.nameId = programState.symbolNames.acquire(".text"); const auto serializePass = [&]() -> Error { state.buffer.clear();