Issue a diagnostic if we try to parse a source file that is too large. (#4429)

Previously, in an optimized build we'd produce bogus tokens, such as
tokens with incorrect `IdentifierId`s, and in a debug build we would try
to CHECK-fail -- but actually wouldn't, because we were incorrectly
checking for `2 << bits` instead of `1 << bits`. I hit this while I was
trying to do some profiling and was seeing some very strange
diagnostics.
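
To see why the old bound is too permissive, here is a minimal sketch; it
assumes a 23-bit payload, which the `MaxTokens` constant and static_assert
added in tokenized_buffer.h below suggest:

// Sketch only; PayloadBits is an assumed value here, not taken from the header.
constexpr int PayloadBits = 23;
// A PayloadBits-wide unsigned bitfield can represent [0, 1 << PayloadBits).
static_assert((2 << PayloadBits) == (1 << (PayloadBits + 1)));
// The old bound lets an index of exactly 1 << PayloadBits slip past the check:
static_assert((1 << PayloadBits) < (2 << PayloadBits));
// ...even though the bitfield silently truncates that value, which is how the
// bogus tokens showed up in optimized builds.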

The diagnostic points at the first token beyond the limit, to help people
determine where to split their files.

---------

Co-authored-by: Jon Ross-Perkins <[email protected]>
zygoloid and jonmeow authored Oct 22, 2024
1 parent af816cd commit e68e54d
Showing 5 changed files with 60 additions and 7 deletions.
1 change: 1 addition & 0 deletions toolchain/diagnostics/diagnostic_kind.def
@@ -41,6 +41,7 @@ CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
CARBON_DIAGNOSTIC_KIND(TooManyDigits)
CARBON_DIAGNOSTIC_KIND(TooManyTokens)
CARBON_DIAGNOSTIC_KIND(TrailingComment)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
4 changes: 4 additions & 0 deletions toolchain/diagnostics/emitted_diagnostics_test.cpp
@@ -60,6 +60,10 @@ static auto IsUntestedDiagnostic(DiagnosticKind diagnostic_kind) -> bool {
// loss in merge conflicts due to the amount of tests being changed right
// now.
return true;
case DiagnosticKind::TooManyTokens:
// This isn't feasible to test with a normal testcase, but is tested in
// lex/tokenized_buffer_test.cpp.
return true;
default:
return false;
}
27 changes: 27 additions & 0 deletions toolchain/lex/lex.cpp
@@ -191,6 +191,11 @@ class [[clang::internal_linkage]] Lexer {

auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;

// Perform final checking and cleanup that should be done once we have
// finished lexing the whole file, and before we consider the tokenized buffer
// to be complete.
auto Finalize() -> void;

auto DiagnoseAndFixMismatchedBrackets() -> void;

// The main entry point for dispatching through the lexer's table. This method
@@ -729,6 +734,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
// dispatch table until everything from source_text is consumed.
DispatchNext(*this, source_text, position);

Finalize();

if (consumer_.seen_error()) {
buffer_.has_errors_ = true;
}
@@ -1342,11 +1349,31 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
NoteWhitespace();

LexToken(TokenKind::FileEnd, position);
}

auto Lexer::Finalize() -> void {
// If we had any mismatched brackets, issue diagnostics and fix them.
if (has_mismatched_brackets_ || !open_groups_.empty()) {
DiagnoseAndFixMismatchedBrackets();
}

// Reject source files with so many tokens that we may have exceeded the
// number of bits in `token_payload_`.
//
// Note that we rely on this check also catching the case where there are too
// many identifiers to fit an `IdentifierId` into a `token_payload_`, and
// likewise for `IntId` and so on. If we start adding any of those IDs prior
// to lexing, we may need to also limit the number of those IDs here.
if (buffer_.token_infos_.size() > TokenizedBuffer::MaxTokens) {
CARBON_DIAGNOSTIC(TooManyTokens, Error,
"too many tokens in source file; try splitting into "
"multiple source files");
// Subtract one to leave room for the `FileEnd` token.
token_emitter_.Emit(TokenIndex(TokenizedBuffer::MaxTokens - 1),
TooManyTokens);
// TODO: Convert tokens after the token limit to error tokens to avoid
// misinterpretation by consumers of the tokenized buffer.
}
}

// A list of pending insertions to make into a tokenized buffer for error
20 changes: 13 additions & 7 deletions toolchain/lex/tokenized_buffer.h
@@ -83,6 +83,10 @@ class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
// `HasError` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
public:
// The maximum number of tokens that can be stored in the buffer, including
// the FileStart and FileEnd tokens.
static constexpr int MaxTokens = 1 << 23;

// A comment, which can be a block of lines.
//
// This is the API version of `CommentData`.
@@ -306,7 +310,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
}
auto set_ident_id(IdentifierId ident_id) -> void {
CARBON_DCHECK(kind() == TokenKind::Identifier);
CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
token_payload_ = ident_id.index;
}

@@ -334,7 +337,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
}
auto set_closing_token_index(TokenIndex closing_index) -> void {
CARBON_DCHECK(kind().is_opening_symbol());
CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
token_payload_ = closing_index.index;
}

@@ -344,7 +346,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
}
auto set_opening_token_index(TokenIndex opening_index) -> void {
CARBON_DCHECK(kind().is_closing_symbol());
CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
token_payload_ = opening_index.index;
}

@@ -395,18 +396,23 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
: kind_(kind),
has_leading_space_(has_leading_space),
token_payload_(payload),
byte_offset_(byte_offset) {
CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
"Payload won't fit into unsigned bit pack: {0}", payload);
}
byte_offset_(byte_offset) {}

// A bitfield that encodes the token's kind, the leading space flag, and the
// remaining bits in a payload. These are encoded together as a bitfield for
// density and because these are the hottest fields of tokens for consumers
// after lexing.
//
// Payload values are typically ID types for which we create at most one per
// token, so we ensure that `token_payload_` is large enough to fit any
// token index. Stores to this field may overflow, but we produce an error
// in `Lexer::Finalize` if the file has more than `MaxTokens` tokens, so
// this value never overflows if lexing succeeds.
TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
bool has_leading_space_ : 1;
unsigned token_payload_ : PayloadBits;
static_assert(MaxTokens <= 1 << PayloadBits,
"Not enough payload bits to store a token index");

// Separate storage for the byte offset, this is hot while lexing but then
// generally cold.
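
To make the packing constraint concrete, here is a rough sketch of the bit
budget; it assumes `TokenKind`'s raw enum type is one byte and that
`PayloadBits` is 23, which the `1 << 23` constant and the static_assert above
suggest but this excerpt does not state directly:

// Assumed widths of the packed 32-bit token word (sketch, not commit code):
//   kind_              : 8 bits  (sizeof(TokenKind) * 8)
//   has_leading_space_ : 1 bit
//   token_payload_     : 23 bits (PayloadBits)
constexpr int kAssumedPayloadBits = 32 - 8 - 1;  // 23
// MaxTokens = 1 << 23 is then the largest token count whose TokenIndex values
// are guaranteed to round-trip through token_payload_, which is exactly what
// the static_assert in the header enforces.
static_assert((1 << 23) <= (1 << kAssumedPayloadBits));
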
15 changes: 15 additions & 0 deletions toolchain/lex/tokenized_buffer_test.cpp
@@ -1107,6 +1107,21 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
compile_helper_.GetTokenizedBuffer("\b", &consumer);
}

TEST_F(LexerTest, DiagnosticFileTooLarge) {
Testing::MockDiagnosticConsumer consumer;
static constexpr size_t NumLines = 10'000'000;
std::string input;
input.reserve(NumLines * 3);
for ([[maybe_unused]] int _ : llvm::seq(NumLines)) {
input += "{}\n";
}
EXPECT_CALL(consumer,
HandleDiagnostic(IsSingleDiagnostic(
DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
TokenizedBuffer::MaxTokens / 2, 1, _)));
compile_helper_.GetTokenizedBuffer(input, &consumer);
}

// Appends comment lines to the string, to create a comment block.
static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
-> void {
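
A note on the expected location in the new test (reasoning reconstructed from
the test above, not stated in the commit; it assumes the third and fourth
IsSingleDiagnostic arguments are the expected line and column):

// Token 0 is FileStart, and each "{}\n" line adds two tokens, so line L holds
// tokens 2L - 1 (the "{" at column 1) and 2L (the "}" at column 2).
// Finalize() points the diagnostic at TokenIndex(MaxTokens - 1), an odd index
// and therefore a "{", on line ((MaxTokens - 1) + 1) / 2 == MaxTokens / 2 at
// column 1, matching the MaxTokens / 2 and 1 expected by the EXPECT_CALL.
static_assert(TokenizedBuffer::MaxTokens % 2 == 0);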
