Skip to content

Commit

Permalink
Canonicalize away bit width and embed small integers into IntIds
Browse files Browse the repository at this point in the history
The first change here is to canonicalize away bit width when tracking
integers in our shared value store. This lets us have a more definitive
model of "what is the mathematical value". It also frees us to use more
efficient bit widths when available, such as bits inside the ID itself.

For canonicalizing, we try to minimize the width adjustments and
maximize the use of the SSO in APInt, and so we never shrink below
64 bits and grow in multiples of the word bit width in the
implementation. We also canonicalize to the signed two's complement
representation so we can represent negative numbers in an intuitive way.

The canonicalizing requires getting the bit width out of the type and
adjusting to it within the toolchain when doing any kind of math, and
this PR updates various places to do that, as well as adding some
convenience APIs to assist.

Then we take advantage of the canonical form and embed small integers
into the ID itself rather than allocating storage for them and
referencing them with an index. This is especially helpful for the
pervasive small integers such as the sizes of types, arrays, etc. Those
no longer require indirection at all. Various short-cut APIs to take
advantage of this have also been added.

This PR improves lexing by about 5% when there are lots of `i32` types.
  • Loading branch information
chandlerc committed Nov 6, 2024
1 parent a68acb1 commit 6d73339
Show file tree
Hide file tree
Showing 20 changed files with 713 additions and 86 deletions.
33 changes: 33 additions & 0 deletions toolchain/base/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ cc_library(
hdrs = ["value_ids.h"],
deps = [
":index_base",
"//common:check",
"//common:ostream",
"@llvm-project//llvm:Support",
],
Expand Down Expand Up @@ -80,10 +81,42 @@ cc_test(
],
)

# Library target for the integer store, built from `int_store.cpp` and
# exporting `int_store.h`.
cc_library(
name = "int_store",
srcs = ["int_store.cpp"],
hdrs = ["int_store.h"],
deps = [
":mem_usage",
":value_ids",
":value_store",
":yaml",
"//common:check",
"//common:hashtable_key_context",
"//common:ostream",
"//common:set",
"@llvm-project//llvm:Support",
],
)

# Small-sized test target built from `int_store_test.cpp`; depends on the
# `:int_store` library above plus the gtest and YAML test helpers.
cc_test(
name = "int_store_test",
size = "small",
srcs = ["int_store_test.cpp"],
deps = [
":int_store",
":value_ids",
"//testing/base:gtest_main",
"//testing/base:test_raw_ostream",
"//toolchain/testing:yaml_test_helpers",
"@googletest//:gtest",
],
)

cc_library(
name = "shared_value_stores",
hdrs = ["shared_value_stores.h"],
deps = [
":int_store",
":mem_usage",
":value_ids",
":value_store",
Expand Down
60 changes: 60 additions & 0 deletions toolchain/base/int_store.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "toolchain/base/int_store.h"

#include <algorithm>
#include <utility>

namespace Carbon {

auto IntStore::CanonicalBitWidth(int significant_bits) -> int {
// For larger integers, we store them in as a signed APInt with a canonical
// width that is the smallest multiple of the word type's bits, but no
// smaller than a minimum of 64 bits to avoid spurious resizing of the most
// common cases (<= 64 bits).
static constexpr int WordWidth = llvm::APInt::APINT_BITS_PER_WORD;

return std::max<int>(
MinAPWidth, ((significant_bits + WordWidth - 1) / WordWidth) * WordWidth);
}

auto IntStore::CanonicalizeSigned(llvm::APInt value) -> llvm::APInt {
  // Pick the canonical width from the number of significant (signed) bits,
  // then sign extend or truncate the value to exactly that width.
  const int canonical_width = CanonicalBitWidth(value.getSignificantBits());
  return value.sextOrTrunc(canonical_width);
}

auto IntStore::CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt {
  // The canonical form is a signed representation, so the width must leave
  // room for a zero sign bit on top of the value's active bits.
  const int canonical_width = CanonicalBitWidth(value.getActiveBits() + 1);
  return value.zextOrTrunc(canonical_width);
}

auto IntStore::AddLarge(int64_t value) -> IntId {
  // Build a signed `APInt` at the canonical width for 64 significant bits and
  // add it to `values_`; the resulting index becomes the `IntId`.
  const int width = CanonicalBitWidth(64);
  const auto stored_id =
      values_.Add(llvm::APInt(width, value, /*isSigned=*/true));
  return IntId::MakeIndexOrInvalid(stored_id.index);
}

// Canonicalizes `value` as a signed integer, adds it to `values_`, and returns
// an `IntId` wrapping the resulting index.
auto IntStore::AddSignedLarge(llvm::APInt value) -> IntId {
  // `value` is an owned by-value parameter that is not used again, so move it
  // into the canonicalization step instead of copying the `APInt`.
  auto ap_id = values_.Add(CanonicalizeSigned(std::move(value)));
  return IntId::MakeIndexOrInvalid(ap_id.index);
}

// Canonicalizes `value` as an unsigned integer, adds it to `values_`, and
// returns an `IntId` wrapping the resulting index.
auto IntStore::AddUnsignedLarge(llvm::APInt value) -> IntId {
  // `value` is an owned by-value parameter that is not used again, so move it
  // into the canonicalization step instead of copying the `APInt`.
  auto ap_id = values_.Add(CanonicalizeUnsigned(std::move(value)));
  return IntId::MakeIndexOrInvalid(ap_id.index);
}

// Canonicalizes `value` as a signed integer, looks it up in `values_`, and
// returns an `IntId` built from the index of the lookup result via
// `IntId::MakeIndexOrInvalid`.
auto IntStore::LookupSignedLarge(llvm::APInt value) const -> IntId {
  // `value` is an owned by-value parameter that is not used again, so move it
  // into the canonicalization step instead of copying the `APInt`.
  auto ap_id = values_.Lookup(CanonicalizeSigned(std::move(value)));
  return IntId::MakeIndexOrInvalid(ap_id.index);
}

// Produces the YAML mapping for this store by forwarding to the underlying
// `values_` store's `OutputYaml`.
auto IntStore::OutputYaml() const -> Yaml::OutputMapping {
return values_.OutputYaml();
}

// Records memory usage into `mem_usage` under `label` by forwarding both
// arguments unchanged to the underlying `values_` store.
auto IntStore::CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
-> void {
values_.CollectMemUsage(mem_usage, label);
}

} // namespace Carbon
186 changes: 186 additions & 0 deletions toolchain/base/int_store.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_BASE_INT_STORE_H_
#define CARBON_TOOLCHAIN_BASE_INT_STORE_H_

#include "common/check.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/value_ids.h"
#include "toolchain/base/value_store.h"
#include "toolchain/base/yaml.h"

namespace Carbon {

// Forward declare a testing peer so we can friend it.
namespace Testing {
struct IntStoreTestPeer;
} // namespace Testing

// A canonicalizing value store with deep optimizations for integers.
//
// This stores integers as abstract, signed mathematical integers. The bit width
// of specific `APInt` values, either as inputs or outputs, is disregarded for
// the purpose of canonicalization and the returned integer may use a very
// different bit width `APInt` than was used when adding. There are also
// optimized paths for adding integer values representable using native integer
// types.
//
// Because the integers in the store are canonicalized without a specific bit
// width there are helper functions to coerce them to a specific desired bit
// width for use.
//
// This leverages a significant optimization for small integer values -- rather
// than canonicalizing and uniquing them in a `ValueStore`, they are directly
// embedded in the `IntId` itself. Only larger integers are stored in an array
// of `APInt` values and represented as an index in the ID.
class IntStore {
 public:
  // Adds an integer value representable in a host `int64_t` to the store.
  // Especially useful when the integer is computed without an `APInt` in the
  // first place.
  //
  // This only accepts a signed `int64_t` and uses the mathematical signed
  // integer value of it as the added integer value.
  //
  // Returns the ID corresponding to this integer value, storing an `APInt` if
  // necessary to represent it.
  auto Add(int64_t value) -> IntId {
    // First try directly making this into an ID.
    if (IntId id = IntId::TryMakeValue(value); id.is_valid()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return AddLarge(value);
  }

  // Stores a canonical copy of a signed value and returns its ID.
  auto AddSigned(llvm::APInt value) -> IntId {
    // First try directly making this into an ID.
    if (IntId id = IntId::TryMakeSignedValue(value); id.is_valid()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return AddSignedLarge(std::move(value));
  }

  // Stores a canonical copy of an unsigned value and returns its ID.
  auto AddUnsigned(llvm::APInt value) -> IntId {
    // First try directly making this into an ID.
    if (IntId id = IntId::TryMakeUnsignedValue(value); id.is_valid())
        [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return AddUnsignedLarge(std::move(value));
  }

  // Returns the value for an ID.
  //
  // This will always be a signed `APInt` with a canonical bit width for the
  // specific integer value in question.
  auto Get(IntId id) const -> llvm::APInt {
    // Values embedded in the ID are materialized at the minimum width.
    if (id.is_value()) [[likely]] {
      return llvm::APInt(MinAPWidth, id.AsValue(), /*isSigned=*/true);
    }
    // Otherwise the ID carries an index into the separately stored values.
    return values_.Get(APIntId(id.AsIndex()));
  }

  // Returns the value for an ID adjusted to a specific bit width.
  //
  // Note that because we store canonical mathematical integers as signed
  // integers, this always sign extends or truncates to the target width. The
  // caller can then use that as a signed or unsigned integer as needed.
  auto GetAtWidth(IntId id, int bit_width) const -> llvm::APInt {
    llvm::APInt value = Get(id);
    if (static_cast<int>(value.getBitWidth()) != bit_width) {
      value = value.sextOrTrunc(bit_width);
    }
    return value;
  }

  // Returns the value for an ID adjusted to the bit width specified with
  // another integer ID.
  //
  // This simply looks up the width integer ID, and then calls the above
  // `GetAtWidth` overload using the value found for it. See that overload for
  // more details.
  //
  // The width must be strictly positive and representable in an `int`; this is
  // enforced with `CARBON_CHECK`.
  auto GetAtWidth(IntId id, IntId bit_width_id) const -> llvm::APInt {
    // `Get` returns by value, so hold the result as a value rather than as a
    // reference bound to a temporary.
    const llvm::APInt bit_width = Get(bit_width_id);
    CARBON_CHECK(bit_width.isStrictlyPositive() &&
                     bit_width.isSignedIntN(sizeof(int) * 8),
                 "Invalid bit width value: {0}", bit_width);
    // The check above guarantees the value fits in an `int`, so make the
    // narrowing from `int64_t` explicit.
    return GetAtWidth(id, static_cast<int>(bit_width.getSExtValue()));
  }

  // Looks up the canonical ID for a value, or returns invalid if not in the
  // store.
  auto LookupSigned(llvm::APInt value) const -> IntId {
    // Values that can be embedded directly in the ID never need a lookup.
    if (IntId id = IntId::TryMakeSignedValue(value); id.is_valid()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return LookupSignedLarge(std::move(value));
  }

  // Output a YAML description of this data structure. Note that this will only
  // include the integers that required storing, not those successfully embedded
  // into the ID space.
  auto OutputYaml() const -> Yaml::OutputMapping;

  // Returns the array of separately stored `APInt` values, as exposed by the
  // underlying value store. Integers embedded directly in an `IntId` are not
  // part of it.
  auto array_ref() const -> llvm::ArrayRef<llvm::APInt> {
    return values_.array_ref();
  }

  // Returns the size of the underlying value store of separately stored
  // integers.
  auto size() const -> size_t { return values_.size(); }

  // Collects the memory usage of the separately stored integers.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

 private:
  friend struct Testing::IntStoreTestPeer;

  // ID type used to index the `APInt` values held in `values_`.
  struct APIntId : IdBase, Printable<APIntId> {
    using ValueType = llvm::APInt;
    static const APIntId Invalid;
    using IdBase::IdBase;
    auto Print(llvm::raw_ostream& out) const -> void {
      out << "ap-int";
      IdBase::Print(out);
    }
  };

  // The minimum bit width of any canonical `APInt`; also the width used when
  // materializing a value embedded directly in an `IntId`.
  static constexpr int MinAPWidth = 64;

  // Pick a canonical bit width for the provided number of significant bits.
  static auto CanonicalBitWidth(int significant_bits) -> int;

  // Canonicalize an incoming signed APInt to the correct bit width.
  static auto CanonicalizeSigned(llvm::APInt value) -> llvm::APInt;

  // Canonicalize an incoming unsigned APInt to the correct bit width.
  static auto CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt;

  // Out-of-line fallbacks for the `Add*` methods above, used when the value
  // could not be turned directly into an ID.
  auto AddLarge(int64_t value) -> IntId;
  auto AddSignedLarge(llvm::APInt value) -> IntId;
  auto AddUnsignedLarge(llvm::APInt value) -> IntId;

  // Out-of-line fallback for `LookupSigned`.
  auto LookupSignedLarge(llvm::APInt value) const -> IntId;

  // Canonicalizing storage for the integers that are not embedded in an ID.
  CanonicalValueStore<APIntId> values_;
};

// Out-of-class definition of `APIntId::Invalid`, constructed from the same
// index that `IntId::Invalid` reports via `AsIndex()`.
constexpr IntStore::APIntId IntStore::APIntId::Invalid(
IntId::Invalid.AsIndex());

} // namespace Carbon

#endif // CARBON_TOOLCHAIN_BASE_INT_STORE_H_
Loading

0 comments on commit 6d73339

Please sign in to comment.