diff --git a/bootstrap/bootstrap.c b/bootstrap/bootstrap.c index c04de981..7c3f3e87 100644 --- a/bootstrap/bootstrap.c +++ b/bootstrap/bootstrap.c @@ -405,7 +405,7 @@ char* dlerror(void); // --- module git_version --- -#define git_version_Describe "6ac702e" +#define git_version_Describe "d488380-dirty" // --- module file_utils --- @@ -5677,7 +5677,6 @@ struct c2_tokenizer_Tokenizer_ { uint32_t next_head; const char* line_start; string_pool_Pool* pool; - string_buffer_Buf* buf; c2_tokenizer_Feature feature_stack[6]; uint32_t feature_count; const string_list_List* features; @@ -6016,7 +6015,7 @@ static const uint8_t c2_tokenizer_Identifier_char[128] = { }; static const c2_tokenizer_Keyword* c2_tokenizer_check_keyword(const char* cp); -static void c2_tokenizer_Tokenizer_init(c2_tokenizer_Tokenizer* t, string_pool_Pool* pool, string_buffer_Buf* buf, const char* input, src_loc_SrcLoc loc_start, const string_list_List* features, bool raw_mode); +static void c2_tokenizer_Tokenizer_init(c2_tokenizer_Tokenizer* t, string_pool_Pool* pool, const char* input, src_loc_SrcLoc loc_start, const string_list_List* features, bool raw_mode); static void c2_tokenizer_Tokenizer_lex(c2_tokenizer_Tokenizer* t, token_Token* result); static void c2_tokenizer_Tokenizer_lex_internal(c2_tokenizer_Tokenizer* t, token_Token* result); static token_Token c2_tokenizer_Tokenizer_lookahead(c2_tokenizer_Tokenizer* t, uint32_t n); @@ -6032,7 +6031,6 @@ static void c2_tokenizer_Tokenizer_lex_floating_point(c2_tokenizer_Tokenizer* t, static uint32_t c2_tokenizer_Tokenizer_lex_escaped_char(c2_tokenizer_Tokenizer* t, token_Token* result); static void c2_tokenizer_Tokenizer_lex_char_literal(c2_tokenizer_Tokenizer* t, token_Token* result); static void c2_tokenizer_Tokenizer_lex_string_literal(c2_tokenizer_Tokenizer* t, token_Token* result); -static bool c2_tokenizer_Tokenizer_lex_string_literal_multi(c2_tokenizer_Tokenizer* t, token_Token* result, uint32_t* num_escapes); static bool c2_tokenizer_Tokenizer_lex_line_comment(c2_tokenizer_Tokenizer* t, token_Token* result); static bool c2_tokenizer_Tokenizer_lex_block_comment(c2_tokenizer_Tokenizer* t, token_Token* result); static bool c2_tokenizer_compare_word(const char* cur, const char* expect); @@ -6045,8 +6043,6 @@ static bool c2_tokenizer_Tokenizer_handle_else(c2_tokenizer_Tokenizer* t, token_ static bool c2_tokenizer_Tokenizer_handle_endif(c2_tokenizer_Tokenizer* t, token_Token* result); static bool c2_tokenizer_Tokenizer_skip_feature(c2_tokenizer_Tokenizer* t, token_Token* result); static void c2_tokenizer_Tokenizer_skip_string_literal(c2_tokenizer_Tokenizer* t); -static bool c2_tokenizer_Tokenizer_is_multi_string(c2_tokenizer_Tokenizer* t); -static bool c2_tokenizer_Tokenizer_skip_to_next_string(c2_tokenizer_Tokenizer* t, token_Token* result); static const c2_tokenizer_Keyword* c2_tokenizer_check_keyword(const char* cp) { @@ -6076,15 +6072,14 @@ static const c2_tokenizer_Keyword* c2_tokenizer_check_keyword(const char* cp) return NULL; } -static void c2_tokenizer_Tokenizer_init(c2_tokenizer_Tokenizer* t, string_pool_Pool* pool, string_buffer_Buf* buf, const char* input, src_loc_SrcLoc loc_start, const string_list_List* features, bool raw_mode) +static void c2_tokenizer_Tokenizer_init(c2_tokenizer_Tokenizer* t, string_pool_Pool* pool, const char* input, src_loc_SrcLoc loc_start, const string_list_List* features, bool raw_mode) { - memset(t, 0, 856); + memset(t, 0, 848); t->cur = input; t->input_start = input; t->loc_start = loc_start; t->line_start = input; t->pool = pool; - t->buf = buf; t->features = features; t->raw_mode = raw_mode; for (uint32_t i = 0; (i < c2_tokenizer_MaxLookahead); i++) { @@ -6422,8 +6417,8 @@ static void c2_tokenizer_Tokenizer_lex_internal(c2_tokenizer_Tokenizer* t, token static token_Token c2_tokenizer_Tokenizer_lookahead(c2_tokenizer_Tokenizer* t, uint32_t n) { - c2_assert(((n > 0)) != 0, "parser/c2_tokenizer.c2:801: c2_tokenizer.Tokenizer.lookahead", "n>0"); - c2_assert(((n <= c2_tokenizer_MaxLookahead)) != 0, "parser/c2_tokenizer.c2:802: c2_tokenizer.Tokenizer.lookahead", "n<=MaxLookahead"); + c2_assert(((n > 0)) != 0, "parser/c2_tokenizer.c2:799: c2_tokenizer.Tokenizer.lookahead", "n>0"); + c2_assert(((n <= c2_tokenizer_MaxLookahead)) != 0, "parser/c2_tokenizer.c2:800: c2_tokenizer.Tokenizer.lookahead", "n<=MaxLookahead"); while ((t->next_count < n)) { const uint32_t slot = (((t->next_head + t->next_count)) % c2_tokenizer_MaxLookahead); c2_tokenizer_Tokenizer_lex_internal(t, &t->next[slot]); @@ -6626,6 +6621,10 @@ static uint32_t c2_tokenizer_Tokenizer_lex_escaped_char(c2_tokenizer_Tokenizer* c2_tokenizer_Tokenizer_error(t, result, "expect hexadecimal number after '\\x'"); return 0; } + if (isxdigit(input[3])) { + c2_tokenizer_Tokenizer_error(t, result, "too many digits in hexadecimal escape sequence '\\x'"); + return 0; + } result->char_value = ((c2_tokenizer_hex2val(input[1]) * 16) + c2_tokenizer_hex2val(input[2])); result->radix = 16; return 3; @@ -6633,12 +6632,12 @@ static uint32_t c2_tokenizer_Tokenizer_lex_escaped_char(c2_tokenizer_Tokenizer* if (c2_tokenizer_is_octal(input[0])) { uint32_t offset = 0; uint32_t value = 0; - while ((c2_tokenizer_is_octal(input[offset]) && (offset <= 2))) { + while ((c2_tokenizer_is_octal(input[offset]) && (offset < 3))) { value *= 8; value += ((uint32_t)((input[offset] - '0'))); offset++; } - if ((value > 127)) { + if ((value > 255)) { t->cur++; c2_tokenizer_Tokenizer_error(t, result, "octal escape sequence out of range"); return 0; @@ -6685,7 +6684,6 @@ static void c2_tokenizer_Tokenizer_lex_string_literal(c2_tokenizer_Tokenizer* t, result->loc = (t->loc_start + ((src_loc_SrcLoc)((t->cur - t->input_start)))); t->cur++; const char* start = t->cur; - uint32_t len; uint32_t num_escapes = 0; while (1) { switch (*t->cur) { @@ -6705,70 +6703,18 @@ static void c2_tokenizer_Tokenizer_lex_string_literal(c2_tokenizer_Tokenizer* t, t->cur += ((esc_len + 1)); break; } - case '"': - goto out; - default: + case '"': { + uint32_t len = ((uint32_t)((t->cur - start))); t->cur++; - break; - } - } - out: - len = ((uint32_t)((t->cur - start))); - t->cur++; - if ((!t->raw_mode && c2_tokenizer_Tokenizer_is_multi_string(t))) { - string_buffer_Buf_clear(t->buf); - string_buffer_Buf_add2(t->buf, start, len); - while (1) { - if (!c2_tokenizer_Tokenizer_skip_to_next_string(t, result)) return; - - if (!c2_tokenizer_Tokenizer_lex_string_literal_multi(t, result, &num_escapes)) return; - - if (!c2_tokenizer_Tokenizer_is_multi_string(t)) break; - - } - result->text_len = ((string_buffer_Buf_size(t->buf) + 1) - num_escapes); - result->text_idx = string_pool_Pool_add(t->pool, string_buffer_Buf_data(t->buf), string_buffer_Buf_size(t->buf), false); - } else { - result->text_len = ((len + 1) - num_escapes); - result->text_idx = string_pool_Pool_add(t->pool, start, len, false); - } -} - -static bool c2_tokenizer_Tokenizer_lex_string_literal_multi(c2_tokenizer_Tokenizer* t, token_Token* result, uint32_t* num_escapes) -{ - uint32_t len; - t->cur++; - const char* start = t->cur; - while (1) { - switch (*t->cur) { - case 0: - fallthrough; - case '\r': - fallthrough; - case '\n': - t->cur--; - c2_tokenizer_Tokenizer_error(t, result, "unterminated string"); - return false; - case '\\': { - uint32_t esc_len = c2_tokenizer_Tokenizer_lex_escaped_char(t, result); - if ((esc_len == 0)) return false; - - *num_escapes += esc_len; - t->cur += ((esc_len + 1)); - break; + result->text_len = ((len + 1) - num_escapes); + result->text_idx = string_pool_Pool_add(t->pool, start, len, false); + return; } - case '"': - goto out; default: t->cur++; break; } } - out: - len = ((uint32_t)((t->cur - start))); - string_buffer_Buf_add2(t->buf, start, len); - t->cur++; - return true; } static bool c2_tokenizer_Tokenizer_lex_line_comment(c2_tokenizer_Tokenizer* t, token_Token* result) @@ -7106,58 +7052,6 @@ static void c2_tokenizer_Tokenizer_skip_string_literal(c2_tokenizer_Tokenizer* t } } -static bool c2_tokenizer_Tokenizer_is_multi_string(c2_tokenizer_Tokenizer* t) -{ - const char* c = t->cur; - while (1) { - switch (*c) { - case '\t': - fallthrough; - case '\n': - fallthrough; - case '\r': - fallthrough; - case ' ': - c++; - break; - case '"': - return true; - default: - return false; - } - } - return false; -} - -static bool c2_tokenizer_Tokenizer_skip_to_next_string(c2_tokenizer_Tokenizer* t, token_Token* result) -{ - while (1) { - switch (*t->cur) { - case '\t': - t->cur++; - break; - case '\n': - t->cur++; - t->line_start = t->cur; - break; - case '\r': - t->cur++; - if ((*t->cur != '\n')) { - c2_tokenizer_Tokenizer_error(t, result, "unexpected char 0x%02X", *t->cur); - return false; - } - t->cur++; - break; - case ' ': - t->cur++; - break; - case '"': - return true; - } - } - return true; -} - // --- module parser_utils --- @@ -7167,16 +7061,14 @@ static src_loc_SrcLoc parser_utils_getTokenEnd(const char* input, src_loc_SrcLoc { c2_tokenizer_Tokenizer tokenizer; string_pool_Pool* pool = string_pool_create(128, 20); - string_buffer_Buf* buf = string_buffer_create(1024, 0, false); string_list_List features; string_list_List_init(&features, pool); - c2_tokenizer_Tokenizer_init(&tokenizer, pool, buf, input, start, &features, false); + c2_tokenizer_Tokenizer_init(&tokenizer, pool, input, start, &features, false); token_Token result; token_Token_init(&result); c2_tokenizer_Tokenizer_lex(&tokenizer, &result); string_list_List_free(&features); string_pool_Pool_free(pool); - string_buffer_Buf_free(buf); return ((start + ((src_loc_SrcLoc)((tokenizer.cur - tokenizer.input_start)))) - 1); } @@ -23129,7 +23021,7 @@ static void c2_parser_Parser_parseAliasType(c2_parser_Parser* p, uint32_t name, static c2_parser_Parser* c2_parser_create(source_mgr_SourceMgr* sm, diagnostics_Diags* diags, string_pool_Pool* pool, ast_builder_Builder* builder, const string_list_List* features) { - c2_parser_Parser* p = calloc(1, 1128); + c2_parser_Parser* p = calloc(1, 1120); p->sm = sm; p->diags = diags; p->pool = pool; @@ -23147,10 +23039,9 @@ static void c2_parser_Parser_parse(c2_parser_Parser* p, int32_t file_id, bool is { p->file_id = file_id; p->is_interface = is_interface; - string_buffer_Buf* buf = string_buffer_create(1024, 0, false); int32_t res = setjmp(&p->jmpbuf); if ((res == 0)) { - c2_tokenizer_Tokenizer_init(&p->tokenizer, p->pool, buf, source_mgr_SourceMgr_get_content(p->sm, p->file_id), source_mgr_SourceMgr_get_offset(p->sm, p->file_id), p->features, false); + c2_tokenizer_Tokenizer_init(&p->tokenizer, p->pool, source_mgr_SourceMgr_get_content(p->sm, p->file_id), source_mgr_SourceMgr_get_offset(p->sm, p->file_id), p->features, false); token_Token_init(&p->tok); c2_parser_Parser_consumeToken(p); c2_parser_Parser_parseModule(p, is_generated); @@ -23159,7 +23050,6 @@ static void c2_parser_Parser_parse(c2_parser_Parser* p, int32_t file_id, bool is c2_parser_Parser_parseTopLevel(p); } } - string_buffer_Buf_free(buf); } static void c2_parser_Parser_consumeToken(c2_parser_Parser* p) @@ -23762,7 +23652,7 @@ static ast_UnaryOpcode c2_parser_convertTokenToUnaryOpcode(token_Kind kind) case token_Kind_Tilde: return ast_UnaryOpcode_Not; default: - c2_assert((0) != 0, "parser/c2_parser_expr.c2:220: c2_parser.convertTokenToUnaryOpcode", "0"); + c2_assert((0) != 0, "parser/c2_parser_expr.c2:225: c2_parser.convertTokenToUnaryOpcode", "0"); break; } return ast_UnaryOpcode_PreInc; @@ -23902,7 +23792,7 @@ static ast_Expr* c2_parser_Parser_parsePostfixExprSuffix(c2_parser_Parser* p, as return lhs; } } - c2_assert((0) != 0, "parser/c2_parser_expr.c2:379: c2_parser.Parser.parsePostfixExprSuffix", "0"); + c2_assert((0) != 0, "parser/c2_parser_expr.c2:384: c2_parser.Parser.parsePostfixExprSuffix", "0"); return NULL; } @@ -23993,9 +23883,33 @@ static ast_IdentifierExpr* c2_parser_Parser_parseIdentifier(c2_parser_Parser* p) static ast_Expr* c2_parser_Parser_parseStringLiteral(c2_parser_Parser* p) { - ast_Expr* e = ast_builder_Builder_actOnStringLiteral(p->builder, p->tok.loc, p->tok.text_idx, p->tok.text_len); + src_loc_SrcLoc loc = p->tok.loc; + uint32_t idx = p->tok.text_idx; + uint32_t len = p->tok.text_len; c2_parser_Parser_consumeToken(p); - return e; + while ((p->tok.kind == token_Kind_StringLiteral)) { + if ((p->tok.text_len > 1)) { + const char* p1 = string_pool_Pool_idx2str(p->pool, idx); + const char* p2 = string_pool_Pool_idx2str(p->pool, p->tok.text_idx); + size_t len1 = strlen(p1); + size_t len2 = strlen(p2); + size_t len3 = ((len1 + 3) + len2); + char* p3 = malloc((len3 + 1)); + memcpy(p3, p1, len1); + if ((((len > 1) && isxdigit(p1[(len - 1)])) && isxdigit(*p2))) { + sprintf((p3 + len1), "\\%03o", (*p2 & 0xff)); + memcpy(((p3 + len1) + 4), (p2 + 1), ((len2 - 1) + 1)); + } else { + memcpy((p3 + len1), p2, (len2 + 1)); + len3 -= 3; + } + idx = string_pool_Pool_add(p->pool, p3, len3, false); + len += (p->tok.text_len - 1); + free(p3); + } + c2_parser_Parser_consumeToken(p); + } + return ast_builder_Builder_actOnStringLiteral(p->builder, loc, idx, len); } static ast_Expr* c2_parser_Parser_parseParenExpr(c2_parser_Parser* p) @@ -24009,7 +23923,7 @@ static ast_Expr* c2_parser_Parser_parseParenExpr(c2_parser_Parser* p) static bool c2_parser_Parser_isTemplateFunctionCall(c2_parser_Parser* p) { - c2_assert(((p->tok.kind == token_Kind_Less)) != 0, "parser/c2_parser_expr.c2:501: c2_parser.Parser.isTemplateFunctionCall", "p.tok.kind==Kind.Less"); + c2_assert(((p->tok.kind == token_Kind_Less)) != 0, "parser/c2_parser_expr.c2:536: c2_parser.Parser.isTemplateFunctionCall", "p.tok.kind==Kind.Less"); uint32_t ahead = 1; token_Token t = c2_tokenizer_Tokenizer_lookahead(&p->tokenizer, ahead); if (((t.kind >= token_Kind_KW_bool) && (t.kind <= token_Kind_KW_void))) return true; @@ -35151,7 +35065,7 @@ static void qbe_generator_build(const char* output_dir) int32_t retval = process_utils_run(dir, "/usr/bin/make", qbe_generator_LogFile); if ((retval != 0)) { console_error("error during external QBE compilation"); - console_log("see %s%s for defails", dir, qbe_generator_LogFile); + console_log("see %s%s for details", dir, qbe_generator_LogFile); } } diff --git a/parser/c2_parser.c2 b/parser/c2_parser.c2 index 67d3ceac..4dac25d9 100644 --- a/parser/c2_parser.c2 +++ b/parser/c2_parser.c2 @@ -77,11 +77,9 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g p.file_id = file_id; p.is_interface = is_interface; - string_buffer.Buf* buf = string_buffer.create(1024, 0, false); #if DumpTokens u64 t1 = utils.now(); p.tokenizer.init(p.pool, - buf, p.sm.get_content(p.file_id), p.sm.get_offset(p.file_id), p.features, @@ -100,7 +98,6 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g if (res == 0) { p.tokenizer.init(p.pool, - buf, p.sm.get_content(p.file_id), p.sm.get_offset(p.file_id), p.features, @@ -117,7 +114,6 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g } } #endif - buf.free(); } fn void Parser.consumeToken(Parser* p) { diff --git a/parser/c2_parser_expr.c2 b/parser/c2_parser_expr.c2 index a2edef49..376855b3 100644 --- a/parser/c2_parser_expr.c2 +++ b/parser/c2_parser_expr.c2 @@ -20,6 +20,11 @@ import ast local; import token local; import src_loc local; +import ctype local; +import stdio local; +import stdlib local; +import string local; + /// PrecedenceLevels - These have been altered from C99 to C2 /// In particular, addition now comes after bitwise and shifts /// Bitwise is directly after shift and equality and relational have @@ -482,9 +487,39 @@ fn IdentifierExpr* Parser.parseIdentifier(Parser* p) { } fn Expr* Parser.parseStringLiteral(Parser* p) { - Expr* e = p.builder.actOnStringLiteral(p.tok.loc, p.tok.text_idx, p.tok.text_len); + SrcLoc loc = p.tok.loc; + u32 idx = p.tok.text_idx; + u32 len = p.tok.text_len; p.consumeToken(); - return e; + // concatenate multi-strings + while (p.tok.kind == Kind.StringLiteral) { + if (p.tok.text_len > 1) { + const char *p1 = p.pool.idx2str(idx); + const char *p2 = p.pool.idx2str(p.tok.text_idx); + usize len1 = strlen(p1); + usize len2 = strlen(p2); + usize len3 = len1 + 3 + len2; + char *p3 = malloc(len3 + 1); + string.memcpy(p3, p1, len1); + if (len > 1 && isxdigit(p1[len - 1]) && isxdigit(*p2)) { + // special case: prevent inadvertent escape sequence pasting + // replace first character with octal escape sequence + // note: hex escape sequence would not work for "#e" as \x23e + // is parsed as a single character by the C compiler + sprintf(p3 + len1, "\\%03o", *p2 & 0xFF); + memcpy(p3 + len1 + 4, p2 + 1, len2 - 1 + 1); + } else { + // regular case: just concatenate the strings + memcpy(p3 + len1, p2, len2 + 1); + len3 -= 3; + } + idx = p.pool.add(p3, len3, false); + len += p.tok.text_len - 1; + free(p3); + } + p.consumeToken(); + } + return p.builder.actOnStringLiteral(loc, idx, len); } fn Expr* Parser.parseParenExpr(Parser* p) { diff --git a/parser/c2_parser_stmt.c2 b/parser/c2_parser_stmt.c2 index f0d399aa..8d09ce8b 100644 --- a/parser/c2_parser_stmt.c2 +++ b/parser/c2_parser_stmt.c2 @@ -252,7 +252,6 @@ fn Stmt* Parser.parseAsmStmt(Parser* p) { p.expectAndConsume(Kind.LParen); p.expect(Kind.StringLiteral); - // TODO concatenate multiple strings Expr* str = p.parseStringLiteral(); ExprList constraints; @@ -310,7 +309,6 @@ fn Stmt* Parser.parseAsmStmt(Parser* p) { // Parse the asm-string list for clobbers if present if (p.tok.kind != Kind.RParen) { while (1) { - // TODO concatenate multiple strings p.expect(Kind.StringLiteral); Expr* e = p.parseStringLiteral(); clobbers.add(e); diff --git a/parser/c2_tokenizer.c2 b/parser/c2_tokenizer.c2 index ef9e952f..d2386e3e 100644 --- a/parser/c2_tokenizer.c2 +++ b/parser/c2_tokenizer.c2 @@ -439,7 +439,6 @@ public type Tokenizer struct { const char* line_start; string_pool.Pool* pool; // no ownership - string_buffer.Buf* buf; // no ownership, used for multi-strings: "a" "b" "c" // Feature handling Feature[constants.MaxFeatureDepth] feature_stack; @@ -452,7 +451,7 @@ public type Tokenizer struct { public fn void Tokenizer.init(Tokenizer* t, string_pool.Pool* pool, - string_buffer.Buf* buf, + //string_buffer.Buf* buf, const char* input, SrcLoc loc_start, const string_list.List* features, @@ -464,7 +463,6 @@ public fn void Tokenizer.init(Tokenizer* t, t.loc_start = loc_start; t.line_start = input; t.pool = pool; - t.buf = buf; t.features = features; t.raw_mode = raw_mode; for (u32 i=0; i(input[offset] - '0'); offset++; } - if (value > 127) { + if (value > 255) { t.cur++; t.error(result, "octal escape sequence out of range"); return 0; @@ -1069,7 +1073,6 @@ fn void Tokenizer.lex_string_literal(Tokenizer* t, Token* result) { result.loc = t.loc_start + cast(t.cur - t.input_start); t.cur++; // skip " const char* start = t.cur; - u32 len; u32 num_escapes = 0; while (1) { @@ -1087,69 +1090,17 @@ fn void Tokenizer.lex_string_literal(Tokenizer* t, Token* result) { t.cur += (esc_len + 1); break; case '"': - goto out; - default: - t.cur++; - break; - } - } -out: - len = cast(t.cur - start); - t.cur++; // skip end delimiter - - // check multi-strings "a" "b" "c", concatenate into single string - if (!t.raw_mode && t.is_multi_string()) { - t.buf.clear(); - t.buf.add2(start, len); - - while (1) { - if (!t.skip_to_next_string(result)) return; - - if (!t.lex_string_literal_multi(result, &num_escapes)) return; - - if (!t.is_multi_string()) break; - } - result.text_len = t.buf.size() + 1 - num_escapes; // include 0-terminator - result.text_idx = t.pool.add(t.buf.data(), t.buf.size(), false); - - } else { - result.text_len = len + 1 - num_escapes; // include 0-terminator - result.text_idx = t.pool.add(start, len, false); - } - // Note: we could put all empty string at index 1 (not 0, since that means nil) -} - -fn bool Tokenizer.lex_string_literal_multi(Tokenizer* t, Token* result, u32* num_escapes) { - u32 len; - t.cur++; // skip start delimiter - const char* start = t.cur; - - while (1) { - switch (*t.cur) { - case 0: - case '\r': - case '\n': - t.cur--; - t.error(result, "unterminated string"); - return false; - case '\\': - u32 esc_len = t.lex_escaped_char(result); - if (esc_len == 0) return false; - *num_escapes += esc_len; - t.cur += (esc_len + 1); - break; - case '"': - goto out; + u32 len = cast(t.cur - start); + t.cur++; // skip string terminator + // Note: we could put all empty strings at index 1 (not 0, since that means nil) + result.text_len = len + 1 - num_escapes; // include 0-terminator + result.text_idx = t.pool.add(start, len, false); + return; default: t.cur++; break; } } -out: - len = cast(t.cur - start); - t.buf.add2(start, len); - t.cur++; // skip end delimiter - return true; } fn bool Tokenizer.lex_line_comment(Tokenizer* t, Token* result) { @@ -1481,53 +1432,3 @@ fn void Tokenizer.skip_string_literal(Tokenizer* t) { } } -fn bool Tokenizer.is_multi_string(Tokenizer* t) { - // skip whitespace/newlines, return true if first other char is " - const char* c = t.cur; - while (1) { - switch (*c) { - case '\t': - case '\n': - case '\r': - case ' ': - c++; - break; - case '"': - return true; - default: - return false; - } - - } - return false; -} - -fn bool Tokenizer.skip_to_next_string(Tokenizer* t, Token* result) { - while (1) { - switch (*t.cur) { - case '\t': - t.cur++; - break; - case '\n': - t.cur++; - t.line_start = t.cur; - break; - case '\r': - t.cur++; - if (*t.cur != '\n') { - t.error(result, "unexpected char 0x%02X", *t.cur); - return false; - } - t.cur++; - break; - case ' ': - t.cur++; - break; - case '"': - return true; - } - - } - return true; -} - diff --git a/parser/parser_utils.c2 b/parser/parser_utils.c2 index 68760531..2bf4d159 100644 --- a/parser/parser_utils.c2 +++ b/parser/parser_utils.c2 @@ -25,10 +25,9 @@ import token local; public fn SrcLoc getTokenEnd(const char* input, SrcLoc start) { c2_tokenizer.Tokenizer tokenizer; string_pool.Pool* pool = string_pool.create(128, 20); - string_buffer.Buf* buf = string_buffer.create(1024, 0, false); string_list.List features; features.init(pool); - tokenizer.init(pool, buf, input, start, &features, false); + tokenizer.init(pool, input, start, &features, false); Token result; result.init(); @@ -36,7 +35,6 @@ public fn SrcLoc getTokenEnd(const char* input, SrcLoc start) { features.free(); pool.free(); - buf.free(); return start + cast(tokenizer.cur - tokenizer.input_start) - 1; } diff --git a/test/parser/char_octal_out_of_range.c2 b/test/parser/char_octal_out_of_range.c2 index 515c77c7..10a4220e 100644 --- a/test/parser/char_octal_out_of_range.c2 +++ b/test/parser/char_octal_out_of_range.c2 @@ -1,5 +1,5 @@ // @warnings{no-unused} module test; -char c = '\200'; // @error{octal escape sequence out of range} +char c = '\400'; // @error{octal escape sequence out of range} diff --git a/test/parser/multi_string.c2 b/test/parser/multi_string.c2 index e80ff6b5..96c2e0b5 100644 --- a/test/parser/multi_string.c2 +++ b/test/parser/multi_string.c2 @@ -16,10 +16,50 @@ const char[] Text = static_assert(19, sizeof(Text)); - const char[] Text2 = +const char[] Text2 = "foo\n" "bar\n" "faa\n"; static_assert(13, sizeof(Text2)); +// escape sequences should not be fused in multi-strings +const char[] Text3a = "\123"; +static_assert(2, sizeof(Text3a)); +const char[] Text3 = "\1" "23"; +static_assert(4, sizeof(Text3)); // should also check strlen(Text3) == 3 + +// multi-string parts should parse as separate tokens +const char[] Text4 = + + "abc" + + "hgi"; + +static_assert(7, sizeof(Text4)); + +// multi-string parts should parse as separate tokens +const char[] Text5 = + "abc" + // "def" + "hgi"; + +static_assert(7, sizeof(Text5)); + +const char[] Text6 = + "abc" + /* + "def" + */ + "hgi"; + +static_assert(7, sizeof(Text6)); + +const char[] Text7 = + "abc" +#if 0 + "def" +#endif + "hgi"; + +static_assert(7, sizeof(Text7)); diff --git a/tools/c2cat.c2 b/tools/c2cat.c2 index 8e394834..e0992a9e 100644 --- a/tools/c2cat.c2 +++ b/tools/c2cat.c2 @@ -229,8 +229,7 @@ public fn i32 main(i32 argc, const char** argv) c2_tokenizer.Tokenizer tokenizer; string_list.List features; input = file.char_data(); - string_buffer.Buf* buf = string_buffer.create(1024, 0, false); - tokenizer.init(pool, buf, input, 0, &features, true); + tokenizer.init(pool, input, 0, &features, true); out = string_buffer.create(16*1024, true, 2); @@ -253,7 +252,6 @@ public fn i32 main(i32 argc, const char** argv) fflush(stdout); out.free(); - buf.free(); file.close(); return 0;