Issue a diagnostic if we try to parse a source file that is too large. (#4429)

Previously, in an optimized build we'd produce bogus tokens, such as
tokens with incorrect `IdentifierId`s, and in a debug build we would try
to CHECK-fail -- but actually wouldn't, because we were incorrectly
checking for `2 << bits` instead of `1 << bits`. I hit this while I was
trying to do some profiling and was seeing some very strange
diagnostics.
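
To see why the old bound is too permissive, here is a minimal sketch; it
assumes a 23-bit payload, which the `MaxTokens` constant and static_assert
added in tokenized_buffer.h below suggest:

// Sketch only; PayloadBits is an assumed value here, not taken from the header.
constexpr int PayloadBits = 23;
// A PayloadBits-wide unsigned bitfield can represent [0, 1 << PayloadBits).
static_assert((2 << PayloadBits) == (1 << (PayloadBits + 1)));
// The old bound lets an index of exactly 1 << PayloadBits slip past the check:
static_assert((1 << PayloadBits) < (2 << PayloadBits));
// ...even though the bitfield silently truncates that value, which is how the
// bogus tokens showed up in optimized builds.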

The diagnostic points at the first token beyond the limit, to help people
determine where to split their files.

---------

Co-authored-by: Jon Ross-Perkins <[email protected]>
zygoloid and jonmeow authored Oct 22, 2024
1 parent af816cd commit e68e54d
Showing 5 changed files with 60 additions and 7 deletions.
1 change: 1 addition & 0 deletions toolchain/diagnostics/diagnostic_kind.def
@@ -41,6 +41,7 @@ CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
CARBON_DIAGNOSTIC_KIND(TooManyDigits)
CARBON_DIAGNOSTIC_KIND(TooManyTokens)
CARBON_DIAGNOSTIC_KIND(TrailingComment)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
4 changes: 4 additions & 0 deletions toolchain/diagnostics/emitted_diagnostics_test.cpp
@@ -60,6 +60,10 @@ static auto IsUntestedDiagnostic(DiagnosticKind diagnostic_kind) -> bool {
// loss in merge conflicts due to the amount of tests being changed right
// now.
return true;
case DiagnosticKind::TooManyTokens:
// This isn't feasible to test with a normal testcase, but is tested in
// lex/tokenized_buffer_test.cpp.
return true;
default:
return false;
}
27 changes: 27 additions & 0 deletions toolchain/lex/lex.cpp
@@ -191,6 +191,11 @@ class [[clang::internal_linkage]] Lexer {

auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;

// Perform final checking and cleanup that should be done once we have
// finished lexing the whole file, and before we consider the tokenized buffer
// to be complete.
auto Finalize() -> void;

auto DiagnoseAndFixMismatchedBrackets() -> void;

// The main entry point for dispatching through the lexer's table. This method
@@ -729,6 +734,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
// dispatch table until everything from source_text is consumed.
DispatchNext(*this, source_text, position);

Finalize();

if (consumer_.seen_error()) {
buffer_.has_errors_ = true;
}
@@ -1342,11 +1349,31 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
NoteWhitespace();

LexToken(TokenKind::FileEnd, position);
}

auto Lexer::Finalize() -> void {
// If we had any mismatched brackets, issue diagnostics and fix them.
if (has_mismatched_brackets_ || !open_groups_.empty()) {
DiagnoseAndFixMismatchedBrackets();
}

// Reject source files with so many tokens that we may have exceeded the
// number of bits in `token_payload_`.
//
// Note that we rely on this check also catching the case where there are too
// many identifiers to fit an `IdentifierId` into a `token_payload_`, and
// likewise for `IntId` and so on. If we start adding any of those IDs prior
// to lexing, we may need to also limit the number of those IDs here.
if (buffer_.token_infos_.size() > TokenizedBuffer::MaxTokens) {
CARBON_DIAGNOSTIC(TooManyTokens, Error,
"too many tokens in source file; try splitting into "
"multiple source files");
// Subtract one to leave room for the `FileEnd` token.
token_emitter_.Emit(TokenIndex(TokenizedBuffer::MaxTokens - 1),
TooManyTokens);
// TODO: Convert tokens after the token limit to error tokens to avoid
// misinterpretation by consumers of the tokenized buffer.
}
}

// A list of pending insertions to make into a tokenized buffer for error
20 changes: 13 additions & 7 deletions toolchain/lex/tokenized_buffer.h
@@ -83,6 +83,10 @@ class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
// `HasError` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
public:
// The maximum number of tokens that can be stored in the buffer, including
// the FileStart and FileEnd tokens.
static constexpr int MaxTokens = 1 << 23;

// A comment, which can be a block of lines.
//
// This is the API version of `CommentData`.
@@ -306,7 +310,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
}
auto set_ident_id(IdentifierId ident_id) -> void {
CARBON_DCHECK(kind() == TokenKind::Identifier);
CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
token_payload_ = ident_id.index;
}

@@ -334,7 +337,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
}
auto set_closing_token_index(TokenIndex closing_index) -> void {
CARBON_DCHECK(kind().is_opening_symbol());
CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
token_payload_ = closing_index.index;
}

@@ -344,7 +346,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
}
auto set_opening_token_index(TokenIndex opening_index) -> void {
CARBON_DCHECK(kind().is_closing_symbol());
CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
token_payload_ = opening_index.index;
}

@@ -395,18 +396,23 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
: kind_(kind),
has_leading_space_(has_leading_space),
token_payload_(payload),
byte_offset_(byte_offset) {
CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
"Payload won't fit into unsigned bit pack: {0}", payload);
}
byte_offset_(byte_offset) {}

// A bitfield that encodes the token's kind, the leading space flag, and the
// remaining bits in a payload. These are encoded together as a bitfield for
// density and because these are the hottest fields of tokens for consumers
// after lexing.
//
// Payload values are typically ID types for which we create at most one per
// token, so we ensure that `token_payload_` is large enough to fit any
// token index. Stores to this field may overflow, but we produce an error
// in `Lexer::Finalize` if the file has more than `MaxTokens` tokens, so
// this value never overflows if lexing succeeds.
TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
bool has_leading_space_ : 1;
unsigned token_payload_ : PayloadBits;
static_assert(MaxTokens <= 1 << PayloadBits,
"Not enough payload bits to store a token index");

// Separate storage for the byte offset, this is hot while lexing but then
// generally cold.
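
To make the packing constraint concrete, here is a rough sketch of the bit
budget; it assumes `TokenKind`'s raw enum type is one byte and that
`PayloadBits` is 23, which the `1 << 23` constant and the static_assert above
suggest but this excerpt does not state directly:

// Assumed widths of the packed 32-bit token word (sketch, not commit code):
//   kind_              : 8 bits  (sizeof(TokenKind) * 8)
//   has_leading_space_ : 1 bit
//   token_payload_     : 23 bits (PayloadBits)
constexpr int kAssumedPayloadBits = 32 - 8 - 1;  // 23
// MaxTokens = 1 << 23 is then the largest token count whose TokenIndex values
// are guaranteed to round-trip through token_payload_, which is exactly what
// the static_assert in the header enforces.
static_assert((1 << 23) <= (1 << kAssumedPayloadBits));
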
15 changes: 15 additions & 0 deletions toolchain/lex/tokenized_buffer_test.cpp
@@ -1107,6 +1107,21 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
compile_helper_.GetTokenizedBuffer("\b", &consumer);
}

TEST_F(LexerTest, DiagnosticFileTooLarge) {
Testing::MockDiagnosticConsumer consumer;
static constexpr size_t NumLines = 10'000'000;
std::string input;
input.reserve(NumLines * 3);
for ([[maybe_unused]] int _ : llvm::seq(NumLines)) {
input += "{}\n";
}
EXPECT_CALL(consumer,
HandleDiagnostic(IsSingleDiagnostic(
DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
TokenizedBuffer::MaxTokens / 2, 1, _)));
compile_helper_.GetTokenizedBuffer(input, &consumer);
}

// Appends comment lines to the string, to create a comment block.
static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
-> void {
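
A note on the expected location in the new test (reasoning reconstructed from
the test above, not stated in the commit; it assumes the third and fourth
IsSingleDiagnostic arguments are the expected line and column):

// Token 0 is FileStart, and each "{}\n" line adds two tokens, so line L holds
// tokens 2L - 1 (the "{" at column 1) and 2L (the "}" at column 2).
// Finalize() points the diagnostic at TokenIndex(MaxTokens - 1), an odd index
// and therefore a "{", on line ((MaxTokens - 1) + 1) / 2 == MaxTokens / 2 at
// column 1, matching the MaxTokens / 2 and 1 expected by the EXPECT_CALL.
static_assert(TokenizedBuffer::MaxTokens % 2 == 0);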
