Issue a diagnostic if we try to parse a source file that is too large. #4429

Merged 11 commits on Oct 22, 2024
1 change: 1 addition & 0 deletions toolchain/diagnostics/diagnostic_kind.def
@@ -41,6 +41,7 @@ CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
CARBON_DIAGNOSTIC_KIND(TooManyDigits)
CARBON_DIAGNOSTIC_KIND(TooManyTokens)
CARBON_DIAGNOSTIC_KIND(TrailingComment)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
4 changes: 4 additions & 0 deletions toolchain/diagnostics/emitted_diagnostics_test.cpp
@@ -60,6 +60,10 @@ static auto IsUntestedDiagnostic(DiagnosticKind diagnostic_kind) -> bool {
      // loss in merge conflicts due to the amount of tests being changed right
      // now.
      return true;
    case DiagnosticKind::TooManyTokens:
      // This isn't feasible to test with a normal testcase, but is tested in
      // lex/tokenized_buffer_test.cpp.
      return true;
    default:
      return false;
  }
27 changes: 27 additions & 0 deletions toolchain/lex/lex.cpp
@@ -191,6 +191,11 @@ class [[clang::internal_linkage]] Lexer {

  auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;

  // Perform final checking and cleanup that should be done once we have
  // finished lexing the whole file, and before we consider the tokenized buffer
  // to be complete.
  auto Finalize() -> void;

  auto DiagnoseAndFixMismatchedBrackets() -> void;

  // The main entry point for dispatching through the lexer's table. This method
@@ -729,6 +734,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
  // dispatch table until everything from source_text is consumed.
  DispatchNext(*this, source_text, position);

  Finalize();

  if (consumer_.seen_error()) {
    buffer_.has_errors_ = true;
  }
@@ -1342,11 +1349,31 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
  NoteWhitespace();

  LexToken(TokenKind::FileEnd, position);
}

auto Lexer::Finalize() -> void {
  // If we had any mismatched brackets, issue diagnostics and fix them.
  if (has_mismatched_brackets_ || !open_groups_.empty()) {
    DiagnoseAndFixMismatchedBrackets();
  }

  // Reject source files with so many tokens that we may have exceeded the
  // number of bits in `token_payload_`.
  //
  // Note that we rely on this check also catching the case where there are too
  // many identifiers to fit an `IdentifierId` into a `token_payload_`, and
  // likewise for `IntId` and so on. If we start adding any of those IDs prior
  // to lexing, we may need to also limit the number of those IDs here.
  if (buffer_.token_infos_.size() > TokenizedBuffer::MaxTokens) {
    CARBON_DIAGNOSTIC(TooManyTokens, Error,
                      "too many tokens in source file; try splitting into "
                      "multiple source files");
    // Subtract one to leave room for the `FileEnd` token.
    token_emitter_.Emit(TokenIndex(TokenizedBuffer::MaxTokens - 1),
                        TooManyTokens);
    // TODO: Convert tokens after the token limit to error tokens to avoid
    // misinterpretation by consumers of the tokenized buffer.
  }
}

// A list of pending insertions to make into a tokenized buffer for error
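As background on why the new check in Lexer::Finalize matters (an illustrative standalone sketch, not code from this PR or the Carbon toolchain): an unsigned bitfield such as token_payload_ wraps out-of-range stores modulo 2^width, so without the token-count check an oversized file would silently corrupt payloads rather than fail loudly. The 23-bit width below mirrors the PR's PayloadBits; everything else is invented for the demo.

#include <cstdio>

// A 23-bit payload field, mirroring `token_payload_ : PayloadBits`.
struct PackedToken {
  unsigned payload : 23;
};

int main() {
  PackedToken token;
  // Store one past the largest 23-bit value (2^23 - 1 = 8'388'607).
  token.payload = 1u << 23;
  // Unsigned bitfields wrap modulo 2^width, so this prints 0: the store
  // truncated silently. That is the failure mode the new diagnostic turns
  // into an explicit error.
  std::printf("payload = %u\n", token.payload);
  return 0;
}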
20 changes: 13 additions & 7 deletions toolchain/lex/tokenized_buffer.h
@@ -83,6 +83,10 @@ class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
// `HasError` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // The maximum number of tokens that can be stored in the buffer, including
  // the FileStart and FileEnd tokens.
  static constexpr int MaxTokens = 1 << 23;

  // A comment, which can be a block of lines.
  //
  // This is the API version of `CommentData`.
@@ -306,7 +310,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
  }
  auto set_ident_id(IdentifierId ident_id) -> void {
    CARBON_DCHECK(kind() == TokenKind::Identifier);
    CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
    token_payload_ = ident_id.index;
  }

@@ -334,7 +337,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
  }
  auto set_closing_token_index(TokenIndex closing_index) -> void {
    CARBON_DCHECK(kind().is_opening_symbol());
    CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
    token_payload_ = closing_index.index;
  }

@@ -344,7 +346,6 @@
  }
  auto set_opening_token_index(TokenIndex opening_index) -> void {
    CARBON_DCHECK(kind().is_closing_symbol());
    CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
    token_payload_ = opening_index.index;
  }

@@ -395,18 +396,23 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
      : kind_(kind),
        has_leading_space_(has_leading_space),
        token_payload_(payload),
        byte_offset_(byte_offset) {
    CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
                  "Payload won't fit into unsigned bit pack: {0}", payload);
  }
        byte_offset_(byte_offset) {}

  // A bitfield that encodes the token's kind, the leading space flag, and the
  // remaining bits in a payload. These are encoded together as a bitfield for
  // density and because these are the hottest fields of tokens for consumers
  // after lexing.
  //
  // Payload values are typically ID types for which we create at most one per
  // token, so we ensure that `token_payload_` is large enough to fit any
  // token index. Stores to this field may overflow, but we produce an error
  // in `Lexer::Finalize` if the file has more than `MaxTokens` tokens, so
  // this value never overflows if lexing succeeds.
  TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
  bool has_leading_space_ : 1;
  unsigned token_payload_ : PayloadBits;
  static_assert(MaxTokens <= 1 << PayloadBits,
                "Not enough payload bits to store a token index");

Review thread on the "Payload values" comment above:

Contributor: Suggested a whitespace-only change to this comment line, noting: "Whitespace suggestion due to the long comment."

Contributor Author: This makes the previous comment that's trying to talk about all three bit-fields less clear. What do you think about moving this comment before all the bit-fields, with the other one?

Review thread on lines +414 to +415 (the static_assert):

Contributor: What made you choose this approach versus relocating PayloadBits to make MaxTokens directly calculated based on it?

Contributor Author: I wanted an explicit value for MaxTokens so that anyone changing that value knows they're changing the token limit, not just some representation detail of the tokenized buffer.
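For illustration, the two shapes discussed in this thread might look roughly like this (a sketch, not Carbon's actual code; the names mirror the PR):

namespace derived_limit {
// The reviewer's alternative: compute the limit from the representation.
// Changing PayloadBits silently moves the token limit with it.
inline constexpr int PayloadBits = 23;
inline constexpr int MaxTokens = 1 << PayloadBits;
}  // namespace derived_limit

namespace explicit_limit {
// The PR's choice: the limit is a named policy value in its own right, and
// a static_assert ties it back to the representation. Editing either
// constant forces a conscious decision, and a mismatch fails to compile.
inline constexpr int MaxTokens = 1 << 23;
inline constexpr int PayloadBits = 23;
static_assert(MaxTokens <= 1 << PayloadBits,
              "Not enough payload bits to store a token index");
}  // namespace explicit_limit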


  // Separate storage for the byte offset, this is hot while lexing but then
  // generally cold.
15 changes: 15 additions & 0 deletions toolchain/lex/tokenized_buffer_test.cpp
@@ -1107,6 +1107,21 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
  compile_helper_.GetTokenizedBuffer("\b", &consumer);
}

TEST_F(LexerTest, DiagnosticFileTooLarge) {
  Testing::MockDiagnosticConsumer consumer;
  static constexpr size_t NumLines = 10'000'000;
  std::string input;
  input.reserve(NumLines * 3);
  for ([[maybe_unused]] int _ : llvm::seq(NumLines)) {
    input += "{}\n";
  }
  EXPECT_CALL(consumer,
              HandleDiagnostic(IsSingleDiagnostic(
                  DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
                  TokenizedBuffer::MaxTokens / 2, 1, _)));
  compile_helper_.GetTokenizedBuffer(input, &consumer);
}
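For context on the expected location (worked out from the constants above rather than stated in the PR): each "{}\n" line lexes to two tokens, so 10,000,000 lines produce about 20,000,000 tokens, well past the limit of 2^23 = 8,388,608. The diagnostic is emitted at token index MaxTokens - 1; with the FileStart token at index 0 and two tokens per line, that index is the opening brace of line MaxTokens / 2, which is why the test expects line TokenizedBuffer::MaxTokens / 2, column 1.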

// Appends comment lines to the string, to create a comment block.
static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
    -> void {