Issue a diagnostic if we try to parse a source file that is too large. #4429

Merged 11 commits on Oct 22, 2024
1 change: 1 addition & 0 deletions toolchain/diagnostics/diagnostic_kind.def
@@ -41,6 +41,7 @@ CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
CARBON_DIAGNOSTIC_KIND(TooManyDigits)
CARBON_DIAGNOSTIC_KIND(TooManyTokens)
CARBON_DIAGNOSTIC_KIND(TrailingComment)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
4 changes: 4 additions & 0 deletions toolchain/diagnostics/emitted_diagnostics_test.cpp
@@ -60,6 +60,10 @@ static auto IsUntestedDiagnostic(DiagnosticKind diagnostic_kind) -> bool {
      // loss in merge conflicts due to the amount of tests being changed right
      // now.
      return true;
    case DiagnosticKind::TooManyTokens:
      // This isn't feasible to test with a normal testcase, but is tested in
      // lex/tokenized_buffer_test.cpp.
      return true;
    default:
      return false;
  }
27 changes: 27 additions & 0 deletions toolchain/lex/lex.cpp
@@ -191,6 +191,11 @@ class [[clang::internal_linkage]] Lexer {

  auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;

  // Perform final checking and cleanup that should be done once we have
  // finished lexing the whole file, and before we consider the tokenized buffer
  // to be complete.
  auto Finalize() -> void;

  auto DiagnoseAndFixMismatchedBrackets() -> void;

  // The main entry point for dispatching through the lexer's table. This method
@@ -729,6 +734,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
  // dispatch table until everything from source_text is consumed.
  DispatchNext(*this, source_text, position);

  Finalize();

  if (consumer_.seen_error()) {
    buffer_.has_errors_ = true;
  }
@@ -1342,11 +1349,31 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
  NoteWhitespace();

  LexToken(TokenKind::FileEnd, position);
}

auto Lexer::Finalize() -> void {
  // If we had any mismatched brackets, issue diagnostics and fix them.
  if (has_mismatched_brackets_ || !open_groups_.empty()) {
    DiagnoseAndFixMismatchedBrackets();
  }

  // Reject source files with so many tokens that we may have exceeded the
  // number of bits in `token_payload_`.
  //
  // Note that we rely on this check also catching the case where there are too
  // many identifiers to fit an `IdentifierId` into a `token_payload_`, and
  // likewise for `IntId` and so on. If we start adding any of those IDs prior
  // to lexing, we may need to also limit the number of those IDs here.
  if (buffer_.token_infos_.size() > TokenizedBuffer::MaxTokens) {
    CARBON_DIAGNOSTIC(TooManyTokens, Error,
                      "too many tokens in source file; try splitting into "
                      "multiple source files");
    // Subtract one to leave room for the `FileEnd` token.
    token_emitter_.Emit(TokenIndex(TokenizedBuffer::MaxTokens - 1),
                        TooManyTokens);
    // TODO: Convert tokens after the token limit to error tokens to avoid
    // misinterpretation by consumers of the tokenized buffer.
  }
}

// A list of pending insertions to make into a tokenized buffer for error
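As background on why the new check in Lexer::Finalize matters (an illustrative standalone sketch, not code from this PR or the Carbon toolchain): an unsigned bitfield such as token_payload_ wraps out-of-range stores modulo 2^width, so without the token-count check an oversized file would silently corrupt payloads rather than fail loudly. The 23-bit width below mirrors the PR's PayloadBits; everything else is invented for the demo.

#include <cstdio>

// A 23-bit payload field, mirroring `token_payload_ : PayloadBits`.
struct PackedToken {
  unsigned payload : 23;
};

int main() {
  PackedToken token;
  // Store one past the largest 23-bit value (2^23 - 1 = 8'388'607).
  token.payload = 1u << 23;
  // Unsigned bitfields wrap modulo 2^width, so this prints 0: the store
  // truncated silently. That is the failure mode the new diagnostic turns
  // into an explicit error.
  std::printf("payload = %u\n", token.payload);
  return 0;
}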
20 changes: 13 additions & 7 deletions toolchain/lex/tokenized_buffer.h
@@ -83,6 +83,10 @@ class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
// `HasError` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // The maximum number of tokens that can be stored in the buffer, including
  // the FileStart and FileEnd tokens.
  static constexpr int MaxTokens = 1 << 23;

  // A comment, which can be a block of lines.
  //
  // This is the API version of `CommentData`.
@@ -306,7 +310,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
  }
  auto set_ident_id(IdentifierId ident_id) -> void {
    CARBON_DCHECK(kind() == TokenKind::Identifier);
    CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
    token_payload_ = ident_id.index;
  }

@@ -334,7 +337,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
  }
  auto set_closing_token_index(TokenIndex closing_index) -> void {
    CARBON_DCHECK(kind().is_opening_symbol());
    CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
    token_payload_ = closing_index.index;
  }

@@ -344,7 +346,6 @@
  }
  auto set_opening_token_index(TokenIndex opening_index) -> void {
    CARBON_DCHECK(kind().is_closing_symbol());
    CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
    token_payload_ = opening_index.index;
  }

@@ -395,18 +396,23 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
      : kind_(kind),
        has_leading_space_(has_leading_space),
        token_payload_(payload),
        byte_offset_(byte_offset) {
    CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
                  "Payload won't fit into unsigned bit pack: {0}", payload);
  }
        byte_offset_(byte_offset) {}

  // A bitfield that encodes the token's kind, the leading space flag, and the
  // remaining bits in a payload. These are encoded together as a bitfield for
  // density and because these are the hottest fields of tokens for consumers
  // after lexing.
  //
  // Payload values are typically ID types for which we create at most one per
  // token, so we ensure that `token_payload_` is large enough to fit any
  // token index. Stores to this field may overflow, but we produce an error
  // in `Lexer::Finalize` if the file has more than `MaxTokens` tokens, so
  // this value never overflows if lexing succeeds.
  TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
  bool has_leading_space_ : 1;
  unsigned token_payload_ : PayloadBits;
  static_assert(MaxTokens <= 1 << PayloadBits,
                "Not enough payload bits to store a token index");

Review thread on the "Payload values" comment above:

Contributor: Suggested a whitespace-only change to this comment line, noting: "Whitespace suggestion due to the long comment."

Contributor Author: This makes the previous comment that's trying to talk about all three bit-fields less clear. What do you think about moving this comment before all the bit-fields, with the other one?

Review thread on lines +414 to +415 (the static_assert):

Contributor: What made you choose this approach versus relocating PayloadBits to make MaxTokens directly calculated based on it?

Contributor Author: I wanted an explicit value for MaxTokens so that anyone changing that value knows they're changing the token limit, not just some representation detail of the tokenized buffer.
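For illustration, the two shapes discussed in this thread might look roughly like this (a sketch, not Carbon's actual code; the names mirror the PR):

namespace derived_limit {
// The reviewer's alternative: compute the limit from the representation.
// Changing PayloadBits silently moves the token limit with it.
inline constexpr int PayloadBits = 23;
inline constexpr int MaxTokens = 1 << PayloadBits;
}  // namespace derived_limit

namespace explicit_limit {
// The PR's choice: the limit is a named policy value in its own right, and
// a static_assert ties it back to the representation. Editing either
// constant forces a conscious decision, and a mismatch fails to compile.
inline constexpr int MaxTokens = 1 << 23;
inline constexpr int PayloadBits = 23;
static_assert(MaxTokens <= 1 << PayloadBits,
              "Not enough payload bits to store a token index");
}  // namespace explicit_limit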


  // Separate storage for the byte offset, this is hot while lexing but then
  // generally cold.
15 changes: 15 additions & 0 deletions toolchain/lex/tokenized_buffer_test.cpp
@@ -1107,6 +1107,21 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
  compile_helper_.GetTokenizedBuffer("\b", &consumer);
}

TEST_F(LexerTest, DiagnosticFileTooLarge) {
  Testing::MockDiagnosticConsumer consumer;
  static constexpr size_t NumLines = 10'000'000;
  std::string input;
  input.reserve(NumLines * 3);
  for ([[maybe_unused]] int _ : llvm::seq(NumLines)) {
    input += "{}\n";
  }
  EXPECT_CALL(consumer,
              HandleDiagnostic(IsSingleDiagnostic(
                  DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
                  TokenizedBuffer::MaxTokens / 2, 1, _)));
  compile_helper_.GetTokenizedBuffer(input, &consumer);
}
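For context on the expected location (worked out from the constants above rather than stated in the PR): each "{}\n" line lexes to two tokens, so 10,000,000 lines produce about 20,000,000 tokens, well past the limit of 2^23 = 8,388,608. The diagnostic is emitted at token index MaxTokens - 1; with the FileStart token at index 0 and two tokens per line, that index is the opening brace of line MaxTokens / 2, which is why the test expects line TokenizedBuffer::MaxTokens / 2, column 1.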

// Appends comment lines to the string, to create a comment block.
static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
    -> void {