Skip to content

Commit

Permalink
Fix multiple issues with multi-strings
Browse files Browse the repository at this point in the history
lexer:
* simplify lexer: do not check for multi-strings
* remove `Tokenizer.lex_string_literal_multi`, `Tokenizer.is_multi_string`,
  `Tokenizer.skip_to_next_string`

parser:
* concatenate multi-strings in `Parser.parseStringLiteral`
* this allows multi-strings to be partially commented or preprocessed
* fix incorrect parsing of `"\1" "23"` to produce 3 characters instead of 1

tests:
* add tests in test/parser/multi_string.c2 for various issues
  • Loading branch information
chqrlie committed Jan 22, 2025
1 parent 14a72a3 commit aa56870
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 114 deletions.
45 changes: 43 additions & 2 deletions parser/c2_parser_expr.c2
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ import ast local;
import token local;
import src_loc local;

import ctype local;
import stdio;
import string;

/// PrecedenceLevels - These have been altered from C99 to C2
/// In particular, addition now comes after bitwise and shifts
/// Bitwise is directly after shift and equality and relational have
Expand Down Expand Up @@ -482,9 +486,46 @@ fn IdentifierExpr* Parser.parseIdentifier(Parser* p) {
}

fn Expr* Parser.parseStringLiteral(Parser* p) {
    // Capture the first literal's location, pool index and logical length
    // (text_len includes the 0-terminator) before consuming the token.
    SrcLoc loc = p.tok.loc;
    u32 idx = p.tok.text_idx;
    u32 len = p.tok.text_len;
    p.consumeToken();
    // Concatenate adjacent string literals ("a" "b") into a single literal.
    // Doing this in the parser instead of the lexer allows parts of a
    // multi-string to be commented out or removed by the preprocessor.
    if (p.tok.kind == Kind.StringLiteral) {
        char[64*1024] tmp;
        const char *p1 = p.pool.idx2str(idx);
        usize len1 = string.strlen(p1);
        if (len1 >= sizeof(tmp)) {
            // NOTE(review): assumes p.error() does not return -- confirm,
            // otherwise the memcpy below would overflow tmp
            p.error("multi-string literal too long");
        }
        string.memcpy(tmp, p1, len1 + 1); // copy including 0-terminator

        while (p.tok.kind == Kind.StringLiteral) {
            if (p.tok.text_len > 1) { // skip empty string parts
                const char *p2 = p.pool.idx2str(p.tok.text_idx);
                usize len2 = string.strlen(p2);
                // +3 reserves room for the possible octal escape expansion
                // below (1 source byte becomes 4 stored bytes)
                if (len1 + len2 + 3 >= sizeof(tmp)) {
                    p.error("multi-string literal too long");
                }
                // special case: prevent inadvertent escape sequence pasting,
                // e.g. "\1" "23" must produce 3 characters, not '\123'.
                // Check the last accumulated byte in tmp: the previous code
                // read p1[len1 - 1], which indexes past the first part once
                // len1 has grown beyond strlen(p1).
                if (len1 > 0 && isxdigit(tmp[len1 - 1]) && isxdigit(*p2)) {
                    // replace first character of the next part with an octal
                    // escape sequence ("\%03o" is always exactly 4 bytes).
                    // note: hex escape sequence would not work for "#e" as \x23e
                    // is parsed as a single (invalid) character by the C compiler
                    stdio.sprintf(tmp + len1, "\\%03o", *p2 & 0xFF);
                    len1 += 4;
                    p2 += 1;
                    len2 -= 1;
                }
                string.memcpy(tmp + len1, p2, len2 + 1);
                len1 += len2;
                // accumulate logical length without this part's 0-terminator;
                // the octal rewrite above keeps the logical count unchanged
                len += p.tok.text_len - 1;
            }
            p.consumeToken();
        }
        idx = p.pool.add(tmp, len1, false);
    }
    return p.builder.actOnStringLiteral(loc, idx, len);
}

fn Expr* Parser.parseParenExpr(Parser* p) {
Expand Down
2 changes: 0 additions & 2 deletions parser/c2_parser_stmt.c2
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ fn Stmt* Parser.parseAsmStmt(Parser* p) {
p.expectAndConsume(Kind.LParen);

p.expect(Kind.StringLiteral);
// TODO concatenate multiple strings
Expr* str = p.parseStringLiteral();

ExprList constraints;
Expand Down Expand Up @@ -310,7 +309,6 @@ fn Stmt* Parser.parseAsmStmt(Parser* p) {
// Parse the asm-string list for clobbers if present
if (p.tok.kind != Kind.RParen) {
while (1) {
// TODO concatenate multiple strings
p.expect(Kind.StringLiteral);
Expr* e = p.parseStringLiteral();
clobbers.add(e);
Expand Down
115 changes: 6 additions & 109 deletions parser/c2_tokenizer.c2
Original file line number Diff line number Diff line change
Expand Up @@ -1187,7 +1187,6 @@ fn void Tokenizer.lex_string_literal(Tokenizer* t, Token* result) {
result.loc = t.loc_start + cast<SrcLoc>(t.cur - t.input_start);
t.cur++; // skip "
const char* start = t.cur;
u32 len;
u32 num_escapes = 0;

while (1) {
Expand All @@ -1205,69 +1204,17 @@ fn void Tokenizer.lex_string_literal(Tokenizer* t, Token* result) {
t.cur += (esc_len + 1);
break;
case '"':
goto out;
default:
t.cur++;
break;
}
}
out:
len = cast<u32>(t.cur - start);
t.cur++; // skip end delimiter

// check multi-strings "a" "b" "c", concatenate into single string
if (!t.raw_mode && t.is_multi_string()) {
t.buf.clear();
t.buf.add2(start, len);

while (1) {
if (!t.skip_to_next_string(result)) return;

if (!t.lex_string_literal_multi(result, &num_escapes)) return;

if (!t.is_multi_string()) break;
}
result.text_len = t.buf.size() + 1 - num_escapes; // include 0-terminator
result.text_idx = t.pool.add(t.buf.data(), t.buf.size(), false);

} else {
result.text_len = len + 1 - num_escapes; // include 0-terminator
result.text_idx = t.pool.add(start, len, false);
}
// Note: we could put all empty string at index 1 (not 0, since that means nil)
}

fn bool Tokenizer.lex_string_literal_multi(Tokenizer* t, Token* result, u32* num_escapes) {
u32 len;
t.cur++; // skip start delimiter
const char* start = t.cur;

while (1) {
switch (*t.cur) {
case 0:
case '\r':
case '\n':
t.cur--;
t.error(result, "unterminated string");
return false;
case '\\':
u32 esc_len = t.lex_escaped_char(result);
if (esc_len == 0) return false;
*num_escapes += esc_len;
t.cur += (esc_len + 1);
break;
case '"':
goto out;
u32 len = cast<u32>(t.cur - start);
t.cur++; // skip string terminator
// Note: we could put all empty strings at index 1 (not 0, since that means nil)
result.text_len = len + 1 - num_escapes; // include 0-terminator
result.text_idx = t.pool.add(start, len, false);
return;
default:
t.cur++;
break;
}
}
out:
len = cast<u32>(t.cur - start);
t.buf.add2(start, len);
t.cur++; // skip end delimiter
return true;
}

fn bool Tokenizer.lex_line_comment(Tokenizer* t, Token* result) {
Expand Down Expand Up @@ -1599,56 +1546,6 @@ fn void Tokenizer.skip_string_literal(Tokenizer* t) {
}
}

// Peek past blanks and line breaks without moving t.cur: report whether
// the next visible character opens another string literal, i.e. the
// current literal is followed by a multi-string continuation "a" "b".
fn bool Tokenizer.is_multi_string(Tokenizer* t) {
    const char* look = t.cur;
    while (1) {
        char ch = *look;
        if (ch == '"') return true;
        // only plain whitespace and line endings may separate the parts
        if (ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r') return false;
        look++;
    }
    return false;
}

// Advance t.cur over whitespace and line endings up to the '"' that opens
// the next part of a multi-string. Returns true once a '"' is reached;
// returns false (after reporting an error into *result) on a lone '\r'
// that is not followed by '\n'.
// NOTE(review): the switch has no default case, so any other character
// leaves t.cur unchanged and the loop spins -- callers are expected to
// invoke this only after is_multi_string() confirmed that the next
// non-blank character is '"'. Confirm at call sites.
fn bool Tokenizer.skip_to_next_string(Tokenizer* t, Token* result) {
while (1) {
switch (*t.cur) {
case '\t':
t.cur++;
break;
case '\n':
t.cur++;
t.line_start = t.cur; // presumably used for column computation -- TODO confirm
break;
case '\r':
t.cur++;
if (*t.cur != '\n') {
// a bare carriage return is rejected: only CRLF line endings pass
t.error(result, "unexpected char 0x%02X", *t.cur);
return false;
}
t.cur++;
break;
case ' ':
t.cur++;
break;
case '"':
return true;
}

}
return true; // unreachable: the loop only exits via the returns above
}

/* decode a UTF-8 sequence:
return -1 on encoding error.
otherwise return codepoint value in range 0..0x1FFFFF and update *endp
Expand Down
42 changes: 41 additions & 1 deletion test/parser/multi_string.c2
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,50 @@ const char[] Text =

static_assert(19, sizeof(Text));

// Multi-string whose parts sit on separate lines.
// (The declaration line was duplicated by fused diff residue; keep one.)
const char[] Text2 =
    "foo\n"
    "bar\n"
    "faa\n";

static_assert(13, sizeof(Text2)); // 3 parts of 4 chars + 0-terminator

// escape sequences should not be fused in multi-strings
const char[] Text3a = "\123";
static_assert(2, sizeof(Text3a)); // single octal-escaped character + 0-terminator
const char[] Text3 = "\1" "23";
static_assert(4, sizeof(Text3)); // should also check strlen(Text3) == 3

// multi-string parts should parse as separate tokens
const char[] Text4 =
// // should accept \f (form feed) as whitespace
"abc"
//
"hgi";

static_assert(7, sizeof(Text4));

// multi-string parts should parse as separate tokens
// a line comment between parts removes that part entirely
const char[] Text5 =
"abc"
// "def"
"hgi";

static_assert(7, sizeof(Text5));

// a block comment between parts is skipped, including the literal inside it
const char[] Text6 =
"abc"
/*
"def"
*/
"hgi";

static_assert(7, sizeof(Text6));

// preprocessor directives may remove parts of a multi-string
const char[] Text7 =
"abc"
#if 0
"def"
#endif
"hgi";

static_assert(7, sizeof(Text7));

0 comments on commit aa56870

Please sign in to comment.