diff --git a/lib/tre-ast.c b/lib/tre-ast.c index acb387a..5a4bb19 100644 --- a/lib/tre-ast.c +++ b/lib/tre-ast.c @@ -33,7 +33,7 @@ tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) } tre_ast_node_t * -tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max) { tre_ast_node_t *node; tre_literal_t *lit; @@ -44,7 +44,7 @@ tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) lit = node->obj; lit->code_min = code_min; lit->code_max = code_max; - lit->position = position; + lit->position = -1; return node; } diff --git a/lib/tre-ast.h b/lib/tre-ast.h index d9af376..4f9589e 100644 --- a/lib/tre-ast.h +++ b/lib/tre-ast.h @@ -101,7 +101,7 @@ tre_ast_node_t * tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size); tre_ast_node_t * -tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position); +tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max); tre_ast_node_t * tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, diff --git a/lib/tre-compile.c b/lib/tre-compile.c index ebccfb8..b84399c 100644 --- a/lib/tre-compile.c +++ b/lib/tre-compile.c @@ -46,7 +46,7 @@ tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) c = tre_mem_alloc(mem, sizeof(*c)); if (c == NULL) return REG_ESPACE; - c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); + c->left = tre_ast_new_literal(mem, TAG, tag_id); if (c->left == NULL) return REG_ESPACE; c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); @@ -78,7 +78,7 @@ tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) c = tre_mem_alloc(mem, sizeof(*c)); if (c == NULL) return REG_ESPACE; - c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); + c->right = tre_ast_new_literal(mem, TAG, tag_id); if (c->right == NULL) return REG_ESPACE; c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); @@ -711,7 +711,7 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t 
*stack, tre_ast_node_t *ast, tag_directions[max] = TRE_TAG_MAXIMIZE; first_tag = 0; } - *result = tre_ast_new_literal(mem, min, max, pos); + *result = tre_ast_new_literal(mem, min, max); if (*result == NULL) status = REG_ESPACE; @@ -798,8 +798,7 @@ typedef enum { iteration count to a catenated sequence of copies of the node. */ static reg_errcode_t tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, - int *position, tre_tag_direction_t *tag_directions, - int *max_depth) + tre_tag_direction_t *tag_directions, int *max_depth) { reg_errcode_t status = REG_OK; int bottom = tre_stack_num_objects(stack); @@ -949,7 +948,7 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, seq2 = copy; if (seq2 == NULL) return REG_ESPACE; - tmp = tre_ast_new_literal(mem, EMPTY, -1, -1); + tmp = tre_ast_new_literal(mem, EMPTY, -1); if (tmp == NULL) return REG_ESPACE; seq2 = tre_ast_new_union(mem, tmp, seq2); @@ -983,12 +982,12 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy; int *old_params; - tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1); + tmp_l = tre_ast_new_literal(mem, PARAMETER, 0); if (!tmp_l) return REG_ESPACE; ((tre_literal_t *)tmp_l->obj)->u.params = iter->params; iter->params[TRE_PARAM_DEPTH] = params_depth + 1; - tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1); + tmp_r = tre_ast_new_literal(mem, PARAMETER, 0); if (!tmp_r) return REG_ESPACE; old_params = tre_mem_alloc(mem, sizeof(*old_params) @@ -1028,19 +1027,9 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, } } - *position += pos_add_total; - - /* `max_pos' should never be larger than `*position' if the above - code works, but just an extra safeguard let's make sure - `*position' is set large enough so enough memory will be - allocated for the transition table. 
*/ - if (max_pos > *position) - *position = max_pos; - #ifdef TRE_DEBUG DPRINT(("Expanded AST:\n")); tre_ast_print(ast); - DPRINT(("*position %d, max_pos %d\n", *position, max_pos)); #endif return status; @@ -1305,33 +1294,36 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, typedef enum { - NFL_RECURSE, - NFL_POST_UNION, - NFL_POST_CATENATION, - NFL_POST_ITERATION -} tre_nfl_stack_symbol_t; + NPFL_RECURSE, + NPFL_POST_UNION, + NPFL_POST_CATENATION, + NPFL_POST_ITERATION +} tre_npfl_stack_symbol_t; -/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for - the nodes of the AST `tree'. */ +/* Computes and fills in the fields `nullable', `position', `firstpos', + and `lastpos' for the nodes of the AST `tree'; `nextpos' points to an + integer indicating the next available position, and will be updated on + return to reflect the number of additional positions assigned. */ static reg_errcode_t -tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) +tre_compute_npfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, + int *nextpos) { int bottom = tre_stack_num_objects(stack); STACK_PUSHR(stack, voidptr, tree); - STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, int, NPFL_RECURSE); while (tre_stack_num_objects(stack) > bottom) { - tre_nfl_stack_symbol_t symbol; + tre_npfl_stack_symbol_t symbol; tre_ast_node_t *node; - symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack); + symbol = (tre_npfl_stack_symbol_t)tre_stack_pop_int(stack); node = tre_stack_pop_voidptr(stack); switch (symbol) { - case NFL_RECURSE: + case NPFL_RECURSE: switch (node->type) { case LITERAL: @@ -1342,6 +1334,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) /* Back references: nullable = false, firstpos = {i}, lastpos = {i}.
*/ node->nullable = 0; + lit->position = (*nextpos)++; node->firstpos = tre_set_one(mem, lit->position, 0, TRE_CHAR_MAX, 0, NULL, -1); if (!node->firstpos) @@ -1369,6 +1362,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) /* Literal at position i: nullable = false, firstpos = {i}, lastpos = {i}. */ node->nullable = 0; + lit->position = (*nextpos)++; node->firstpos = tre_set_one(mem, lit->position, (int)lit->code_min, (int)lit->code_max, 0, NULL, -1); @@ -1389,36 +1383,36 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) /* Compute the attributes for the two subtrees, and after that for this node. */ STACK_PUSHR(stack, voidptr, node); - STACK_PUSHR(stack, int, NFL_POST_UNION); + STACK_PUSHR(stack, int, NPFL_POST_UNION); STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right); - STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, int, NPFL_RECURSE); STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left); - STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, int, NPFL_RECURSE); break; case CATENATION: /* Compute the attributes for the two subtrees, and after that for this node. */ STACK_PUSHR(stack, voidptr, node); - STACK_PUSHR(stack, int, NFL_POST_CATENATION); + STACK_PUSHR(stack, int, NPFL_POST_CATENATION); STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right); - STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, int, NPFL_RECURSE); STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left); - STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, int, NPFL_RECURSE); break; case ITERATION: /* Compute the attributes for the subtree, and after that for this node. 
*/ STACK_PUSHR(stack, voidptr, node); - STACK_PUSHR(stack, int, NFL_POST_ITERATION); + STACK_PUSHR(stack, int, NPFL_POST_ITERATION); STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg); - STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, int, NPFL_RECURSE); break; } - break; /* end case: NFL_RECURSE */ + break; /* end case: NPFL_RECURSE */ - case NFL_POST_UNION: + case NPFL_POST_UNION: { tre_union_t *uni = (tre_union_t *)node->obj; node->nullable = uni->left->nullable || uni->right->nullable; @@ -1433,7 +1427,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) break; } - case NFL_POST_ITERATION: + case NPFL_POST_ITERATION: { tre_iteration_t *iter = (tre_iteration_t *)node->obj; @@ -1446,7 +1440,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) break; } - case NFL_POST_CATENATION: + case NPFL_POST_CATENATION: { int num_tags, *tags, assertions, params_seen; int *params; @@ -1839,7 +1833,6 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, return errcode; } - #define ERROR_EXIT(err) \ do \ { \ @@ -1864,6 +1857,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) tre_tag_direction_t *tag_directions = NULL; reg_errcode_t errcode; tre_mem_t mem; + int numpos = 0; /* Parse context. */ tre_parse_ctx_t parse_ctx; @@ -1970,8 +1964,8 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) } /* Expand iteration nodes. */ - errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, - tag_directions, &tnfa->params_depth); + errcode = tre_expand_ast(mem, stack, tree, tag_directions, + &tnfa->params_depth); if (errcode != REG_OK) ERROR_EXIT(errcode); @@ -1980,7 +1974,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) for example "a*" or "ab*". Figure out a simple way to detect this possibility. 
*/ tmp_ast_l = tree; - tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++); + tmp_ast_r = tre_ast_new_literal(mem, 0, 0); if (tmp_ast_r == NULL) ERROR_EXIT(REG_ESPACE); @@ -1988,29 +1982,29 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (tree == NULL) ERROR_EXIT(REG_ESPACE); + errcode = tre_compute_npfl(mem, stack, tree, &numpos); + if (errcode != REG_OK) + ERROR_EXIT(errcode); + #ifdef TRE_DEBUG tre_ast_print(tree); - DPRINT(("Number of states: %d\n", parse_ctx.position)); + DPRINT(("Number of states: %d\n", numpos)); #endif /* TRE_DEBUG */ - errcode = tre_compute_nfl(mem, stack, tree); - if (errcode != REG_OK) - ERROR_EXIT(errcode); - - counts = xmalloc(sizeof(int) * parse_ctx.position); + counts = xmalloc(sizeof(int) * numpos); if (counts == NULL) ERROR_EXIT(REG_ESPACE); - offs = xmalloc(sizeof(int) * parse_ctx.position); + offs = xmalloc(sizeof(int) * numpos); if (offs == NULL) ERROR_EXIT(REG_ESPACE); - for (i = 0; i < parse_ctx.position; i++) + for (i = 0; i < numpos; i++) counts[i] = 0; tre_ast_to_tnfa(tree, NULL, counts, NULL); add = 0; - for (i = 0; i < parse_ctx.position; i++) + for (i = 0; i < numpos; i++) { offs[i] = add; add += counts[i] + 1; @@ -2148,7 +2142,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) tnfa->num_transitions = add; tnfa->final = transitions + offs[tree->lastpos[0].position]; - tnfa->num_states = parse_ctx.position; + tnfa->num_states = numpos; tnfa->cflags = cflags; DPRINT(("final state %p\n", (void *)tnfa->final)); diff --git a/lib/tre-parse.c b/lib/tre-parse.c index b62c022..d0540cd 100644 --- a/lib/tre-parse.c +++ b/lib/tre-parse.c @@ -113,7 +113,7 @@ tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, return REG_ESPACE; *items = array = new_items; } - array[*i] = tre_ast_new_literal(mem, min, max, -1); + array[*i] = tre_ast_new_literal(mem, min, max); status = array[*i] == NULL ? 
REG_ESPACE : REG_OK; (*i)++; return status; @@ -491,7 +491,6 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { int k; DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); - l->position = ctx->position; if (num_neg_classes > 0) { l->neg_classes = tre_mem_alloc(ctx->mem, @@ -527,7 +526,7 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) { int k; DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); - n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); + n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX); if (n == NULL) status = REG_ESPACE; else @@ -570,7 +569,6 @@ tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) parse_bracket_done: xfree(items); - ctx->position++; *result = node; return status; } @@ -844,7 +842,7 @@ tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) /* Create the AST node(s). */ if (min == 0 && max == 0) { - *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + *result = tre_ast_new_literal(ctx->mem, EMPTY, -1); if (*result == NULL) return REG_ESPACE; } @@ -1363,7 +1361,7 @@ tre_parse(tre_parse_ctx_t *ctx) subexpression was closed. POSIX leaves the meaning of this to be implementation-defined. We interpret this as an empty expression (which matches an empty string). 
*/ - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + result = tre_ast_new_literal(ctx->mem, EMPTY, -1); if (result == NULL) return REG_ESPACE; if (!(ctx->cflags & REG_EXTENDED)) @@ -1411,7 +1409,6 @@ tre_parse(tre_parse_ctx_t *ctx) if (status != REG_OK) return status; ctx->re += 2; - ctx->position = subctx.position; result = subctx.result; break; } @@ -1440,22 +1437,22 @@ tre_parse(tre_parse_ctx_t *ctx) { case L'b': result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_WB, -1); + ASSERT_AT_WB); ctx->re++; break; case L'B': result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_WB_NEG, -1); + ASSERT_AT_WB_NEG); ctx->re++; break; case L'<': result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOW, -1); + ASSERT_AT_BOW); ctx->re++; break; case L'>': result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOW, -1); + ASSERT_AT_EOW); ctx->re++; break; case L'x': @@ -1479,9 +1476,7 @@ tre_parse(tre_parse_ctx_t *ctx) ctx->re++; } val = strtol(tmp, NULL, 16); - result = tre_ast_new_literal(ctx->mem, (int)val, - (int)val, ctx->position); - ctx->position++; + result = tre_ast_new_literal(ctx->mem, (int)val, (int)val); break; } else if (ctx->re < ctx->re_end) @@ -1507,9 +1502,7 @@ tre_parse(tre_parse_ctx_t *ctx) ctx->re++; tmp[i] = 0; val = strtol(tmp, NULL, 16); - result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, - ctx->position); - ctx->position++; + result = tre_ast_new_literal(ctx->mem, (int)val, (int)val); break; } /*FALLTHROUGH*/ @@ -1521,11 +1514,9 @@ tre_parse(tre_parse_ctx_t *ctx) int val = *ctx->re - L'0'; DPRINT(("tre_parse: backref: '%.*" STRF "'\n", REST(ctx->re - 1))); - result = tre_ast_new_literal(ctx->mem, BACKREF, val, - ctx->position); + result = tre_ast_new_literal(ctx->mem, BACKREF, val); if (result == NULL) return REG_ESPACE; - ctx->position++; ctx->max_backref = MAX(val, ctx->max_backref); ctx->re++; } @@ -1534,9 +1525,7 @@ tre_parse(tre_parse_ctx_t *ctx) /* Escaped character. 
*/ DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", REST(ctx->re - 1))); - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); - ctx->position++; + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re); ctx->re++; } break; @@ -1552,26 +1541,21 @@ tre_parse(tre_parse_ctx_t *ctx) { tre_ast_node_t *tmp1; tre_ast_node_t *tmp2; - tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1, - ctx->position); + tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1); if (!tmp1) return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX, - ctx->position + 1); + tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX); if (!tmp2) return REG_ESPACE; result = tre_ast_new_union(ctx->mem, tmp1, tmp2); if (!result) return REG_ESPACE; - ctx->position += 2; } else { - result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, - ctx->position); + result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX); if (!result) return REG_ESPACE; - ctx->position++; } ctx->re++; break; @@ -1588,7 +1572,7 @@ tre_parse(tre_parse_ctx_t *ctx) DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", REST(ctx->re))); result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOL, -1); + ASSERT_AT_BOL); if (result == NULL) return REG_ESPACE; ctx->re++; @@ -1609,7 +1593,7 @@ tre_parse(tre_parse_ctx_t *ctx) DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", REST(ctx->re))); result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOL, -1); + ASSERT_AT_EOL); if (result == NULL) return REG_ESPACE; ctx->re++; @@ -1656,7 +1640,7 @@ tre_parse(tre_parse_ctx_t *ctx) { DPRINT(("tre_parse: empty: '%.*" STRF "'\n", REST(ctx->re))); - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + result = tre_ast_new_literal(ctx->mem, EMPTY, -1); if (!result) return REG_ESPACE; break; @@ -1682,13 +1666,11 @@ tre_parse(tre_parse_ctx_t *ctx) could be several opposite-case counterpoints, but they cannot be supported portably anyway. 
*/ tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), - tre_toupper(*ctx->re), - ctx->position); + tre_toupper(*ctx->re)); if (!tmp1) return REG_ESPACE; tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), - tre_tolower(*ctx->re), - ctx->position); + tre_tolower(*ctx->re)); if (!tmp2) return REG_ESPACE; result = tre_ast_new_union(ctx->mem, tmp1, tmp2); @@ -1697,12 +1679,10 @@ tre_parse(tre_parse_ctx_t *ctx) } else { - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); + result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re); if (!result) return REG_ESPACE; } - ctx->position++; ctx->re++; break; } @@ -1715,7 +1695,7 @@ tre_parse(tre_parse_ctx_t *ctx) if (result->submatch_id >= 0) { tre_ast_node_t *n, *tmp_node; - n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + n = tre_ast_new_literal(ctx->mem, EMPTY, -1); if (n == NULL) return REG_ESPACE; tmp_node = tre_ast_new_catenation(ctx->mem, n, result); diff --git a/lib/tre-parse.h b/lib/tre-parse.h index 3a7b7e0..f519714 100644 --- a/lib/tre-parse.h +++ b/lib/tre-parse.h @@ -26,8 +26,6 @@ typedef struct { int len; /* Current submatch ID. */ int submatch_id; - /* Current position (number of literal). */ - int position; /* The highest back reference or -1 if none seen so far. */ int max_backref; /* This flag is set if the regexp uses approximate matching. */ diff --git a/tests/retest.c b/tests/retest.c index 21b1e56..b31f160 100644 --- a/tests/retest.c +++ b/tests/retest.c @@ -1511,6 +1511,13 @@ main(int argc, char **argv) test_exec("abbabbbabaabbbbbbbbbbbba", 0, REG_OK, 0, 24, 0, 10, 10, 22, END); + test_comp("^((a{1,2})?x)*y", REG_EXTENDED | REG_NOSUB, REG_OK); + test_exec("y", 0, REG_OK, END); + test_exec("xy", 0, REG_OK, END); + test_exec("axy", 0, REG_OK, END); + test_exec("aaxy", 0, REG_OK, END); + test_exec("aaaxy", 0, REG_NOMATCH, END); + /* Test repeating something that has submatches inside. 
*/ test_comp("(a){0,5}", REG_EXTENDED, 0); test_exec("", 0, REG_OK, 0, 0, -1, -1, END);