diff --git a/toolchain/diagnostics/diagnostic_kind.def b/toolchain/diagnostics/diagnostic_kind.def
index 92321735454d..115399115dfa 100644
--- a/toolchain/diagnostics/diagnostic_kind.def
+++ b/toolchain/diagnostics/diagnostic_kind.def
@@ -41,6 +41,7 @@ CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
 CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
 CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
 CARBON_DIAGNOSTIC_KIND(TooManyDigits)
+CARBON_DIAGNOSTIC_KIND(TooManyTokens)
 CARBON_DIAGNOSTIC_KIND(TrailingComment)
 CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
 CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
diff --git a/toolchain/diagnostics/emitted_diagnostics_test.cpp b/toolchain/diagnostics/emitted_diagnostics_test.cpp
index 6209c128ecbf..4f726f0ddd24 100644
--- a/toolchain/diagnostics/emitted_diagnostics_test.cpp
+++ b/toolchain/diagnostics/emitted_diagnostics_test.cpp
@@ -60,6 +60,10 @@ static auto IsUntestedDiagnostic(DiagnosticKind diagnostic_kind) -> bool {
       // loss in merge conflicts due to the amount of tests being changed right
       // now.
       return true;
+    case DiagnosticKind::TooManyTokens:
+      // This isn't feasible to test with a normal testcase, but is tested in
+      // lex/tokenized_buffer_test.cpp.
+      return true;
     default:
       return false;
   }
diff --git a/toolchain/lex/lex.cpp b/toolchain/lex/lex.cpp
index 9cb3c3175fe6..db3074194464 100644
--- a/toolchain/lex/lex.cpp
+++ b/toolchain/lex/lex.cpp
@@ -191,6 +191,11 @@ class [[clang::internal_linkage]] Lexer {
 
   auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;
 
+  // Perform final checking and cleanup that should be done once we have
+  // finished lexing the whole file, and before we consider the tokenized
+  // buffer to be complete.
+  auto Finalize() -> void;
+
   auto DiagnoseAndFixMismatchedBrackets() -> void;
 
   // The main entry point for dispatching through the lexer's table. This method
@@ -729,6 +734,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
   // dispatch table until everything from source_text is consumed.
   DispatchNext(*this, source_text, position);
 
+  Finalize();
+
   if (consumer_.seen_error()) {
     buffer_.has_errors_ = true;
   }
@@ -1342,11 +1349,31 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
   NoteWhitespace();
 
   LexToken(TokenKind::FileEnd, position);
+}
 
+auto Lexer::Finalize() -> void {
   // If we had any mismatched brackets, issue diagnostics and fix them.
   if (has_mismatched_brackets_ || !open_groups_.empty()) {
     DiagnoseAndFixMismatchedBrackets();
   }
+
+  // Reject source files with so many tokens that we may have exceeded the
+  // number of bits in `token_payload_`.
+  //
+  // Note that we rely on this check also catching the case where there are too
+  // many identifiers to fit an `IdentifierId` into a `token_payload_`, and
+  // likewise for `IntId` and so on. If we start adding any of those IDs prior
+  // to lexing, we may need to also limit the number of those IDs here.
+  if (buffer_.token_infos_.size() > TokenizedBuffer::MaxTokens) {
+    CARBON_DIAGNOSTIC(TooManyTokens, Error,
+                      "too many tokens in source file; try splitting into "
+                      "multiple source files");
+    // Subtract one to leave room for the `FileEnd` token.
+    token_emitter_.Emit(TokenIndex(TokenizedBuffer::MaxTokens - 1),
+                        TooManyTokens);
+    // TODO: Convert tokens after the token limit to error tokens to avoid
+    // misinterpretation by consumers of the tokenized buffer.
+  }
 }
 
 // A list of pending insertions to make into a tokenized buffer for error
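
Aside on the new `Lexer::Finalize` check (a sketch, not part of the patch):
a single token-count comparison is enough to bound every payload ID because
the lexer creates at most one `IdentifierId`, `IntId`, and so on per token,
so each ID count is itself bounded by the token count. Restated as a
standalone compile-time check, assuming `PayloadBits = 23` to match
`MaxTokens`:

    // Hypothetical restatement; `PayloadBits` is assumed, not quoted.
    constexpr int PayloadBits = 23;
    constexpr int MaxTokens = 1 << 23;
    // num_ids <= num_tokens <= MaxTokens, so every token-derived ID fits.
    static_assert(MaxTokens <= (1 << PayloadBits),
                  "payload bits must cover every token-derived ID");
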
diff --git a/toolchain/lex/tokenized_buffer.h b/toolchain/lex/tokenized_buffer.h
index 30dbc9c9a432..7074f1e79256 100644
--- a/toolchain/lex/tokenized_buffer.h
+++ b/toolchain/lex/tokenized_buffer.h
@@ -83,6 +83,10 @@ class TokenDiagnosticConverter : public DiagnosticConverter {
 // `HasError` returning true.
 class TokenizedBuffer : public Printable {
  public:
+  // The maximum number of tokens that can be stored in the buffer, including
+  // the FileStart and FileEnd tokens.
+  static constexpr int MaxTokens = 1 << 23;
+
   // A comment, which can be a block of lines.
   //
   // This is the API version of `CommentData`.
@@ -306,7 +310,6 @@ class TokenizedBuffer : public Printable {
   }
 
   auto set_ident_id(IdentifierId ident_id) -> void {
     CARBON_DCHECK(kind() == TokenKind::Identifier);
-    CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
     token_payload_ = ident_id.index;
   }
@@ -334,7 +337,6 @@ class TokenizedBuffer : public Printable {
   }
 
   auto set_closing_token_index(TokenIndex closing_index) -> void {
     CARBON_DCHECK(kind().is_opening_symbol());
-    CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
     token_payload_ = closing_index.index;
   }
@@ -344,7 +346,6 @@ class TokenizedBuffer : public Printable {
   }
 
   auto set_opening_token_index(TokenIndex opening_index) -> void {
     CARBON_DCHECK(kind().is_closing_symbol());
-    CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
     token_payload_ = opening_index.index;
   }
@@ -395,18 +396,23 @@ class TokenizedBuffer : public Printable {
         : kind_(kind),
           has_leading_space_(has_leading_space),
           token_payload_(payload),
-          byte_offset_(byte_offset) {
-      CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
-                    "Payload won't fit into unsigned bit pack: {0}", payload);
-    }
+          byte_offset_(byte_offset) {}
 
     // A bitfield that encodes the token's kind, the leading space flag, and the
     // remaining bits in a payload. These are encoded together as a bitfield for
     // density and because these are the hottest fields of tokens for consumers
     // after lexing.
+    //
+    // Payload values are typically ID types for which we create at most one per
+    // token, so we ensure that `token_payload_` is large enough to fit any
+    // token index. Stores to this field may overflow, but we produce an error
+    // in `Lexer::Finalize` if the file has more than `MaxTokens` tokens, so
+    // this value never overflows if lexing succeeds.
     TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
     bool has_leading_space_ : 1;
     unsigned token_payload_ : PayloadBits;
+    static_assert(MaxTokens <= 1 << PayloadBits,
+                  "Not enough payload bits to store a token index");
 
     // Separate storage for the byte offset, this is hot while lexing but then
     // generally cold.
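
The per-setter `CARBON_DCHECK`s removed above become redundant once the
finalize-time limit exists, because a store into the `PayloadBits`-wide
bitfield silently keeps only the low bits rather than trapping. A
self-contained demonstration of that wrap-around (the field widths mirror
the diff; the struct name and `main` harness are invented for illustration):

    #include <cstdio>

    struct Packed {
      unsigned kind : 8;  // stands in for `TokenKind::RawEnumType`
      unsigned has_leading_space : 1;
      unsigned token_payload : 23;  // `PayloadBits`
    };

    int main() {
      Packed p = {};
      unsigned v = (1u << 23) + 5;  // one past the representable range
      p.token_payload = v;          // only the low 23 bits are kept
      // Prints 5: the high bit is silently dropped, which is why the lexer
      // must reject files with more than `MaxTokens` tokens.
      std::printf("%u\n", static_cast<unsigned>(p.token_payload));
      return 0;
    }
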
diff --git a/toolchain/lex/tokenized_buffer_test.cpp b/toolchain/lex/tokenized_buffer_test.cpp
index 8305cf8876ff..b756447cb9db 100644
--- a/toolchain/lex/tokenized_buffer_test.cpp
+++ b/toolchain/lex/tokenized_buffer_test.cpp
@@ -1107,6 +1107,21 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
   compile_helper_.GetTokenizedBuffer("\b", &consumer);
 }
 
+TEST_F(LexerTest, DiagnosticFileTooLarge) {
+  Testing::MockDiagnosticConsumer consumer;
+  static constexpr size_t NumLines = 10'000'000;
+  std::string input;
+  input.reserve(NumLines * 3);
+  for ([[maybe_unused]] int _ : llvm::seq(NumLines)) {
+    input += "{}\n";
+  }
+  EXPECT_CALL(consumer,
+              HandleDiagnostic(IsSingleDiagnostic(
+                  DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
+                  TokenizedBuffer::MaxTokens / 2, 1, _)));
+  compile_helper_.GetTokenizedBuffer(input, &consumer);
+}
+
 // Appends comment lines to the string, to create a comment block.
 static auto AppendCommentLines(std::string& str, int count,
                                llvm::StringRef tag) -> void {
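
The test's expected location follows from simple arithmetic (worked here as
a sketch, not part of the patch): 10'000'000 lines of "{}\n" lex to roughly
20 million tokens, well past the 2^23 = 8'388'608 limit. `FileStart`
occupies token index 0 and each line contributes two tokens, so token index
i falls on line (i + 1) / 2, and the diagnostic is anchored at the odd index
`MaxTokens - 1`, which is the `{` in column 1 of line `MaxTokens / 2`:

    // Location arithmetic under the assumptions above.
    constexpr int MaxTokens = 1 << 23;
    constexpr int AnchorToken = MaxTokens - 1;  // leaves room for `FileEnd`
    constexpr int Line = (AnchorToken + 1) / 2;
    static_assert(Line == MaxTokens / 2);  // 4'194'304; column 1 is the `{`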