Skip to content

Commit

Permalink
[Bug #20504] Move dynamic regexp concatenation to iseq compiler
Browse files Browse the repository at this point in the history
  • Loading branch information
nobu committed Jan 3, 2025
1 parent 77fe822 commit 6bbb470
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 101 deletions.
118 changes: 90 additions & 28 deletions compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -3820,6 +3820,24 @@ iseq_peephole_optimize(rb_iseq_t *iseq, LINK_ELEMENT *list, const int do_tailcal
}
ELEM_REMOVE(&iobj->link);
}
if (IS_NEXT_INSN_ID(&iobj->link, toregexp)) {
INSN *next = (INSN *)iobj->link.next;
if (OPERAND_AT(next, 1) == INT2FIX(1)) {
VALUE src = OPERAND_AT(iobj, 0);
int opt = (int)FIX2LONG(OPERAND_AT(next, 0));
VALUE path = rb_iseq_path(iseq);
int line = iobj->insn_info.line_no;
VALUE errinfo = rb_errinfo();
VALUE re = rb_reg_compile(src, opt, RSTRING_PTR(path), line);
if (NIL_P(re)) {
VALUE message = rb_attr_get(rb_errinfo(), idMesg);
rb_set_errinfo(errinfo);
COMPILE_ERROR(iseq, line, "%" PRIsVALUE, message);
}
RB_OBJ_WRITE(iseq, &OPERAND_AT(iobj, 0), re);
ELEM_REMOVE(iobj->link.next);
}
}
}

if (IS_INSN_ID(iobj, concatstrings)) {
Expand Down Expand Up @@ -4502,47 +4520,91 @@ all_string_result_p(const NODE *node)
}
}

/*
 * Working state shared by the helpers that compile the fragments of a
 * dynamic string / dynamic regexp node.  Adjacent literal fragments are
 * accumulated in `lit` and emitted as a single putobject when flushed.
 */
struct dstr_ctxt {
rb_iseq_t *const iseq; /* iseq being compiled; used for COMPILE_ERROR and RB_OBJ_WRITTEN */
LINK_ANCHOR *const ret; /* anchor the generated instructions are appended to */
VALUE lit; /* literal text accumulated but not yet emitted; Qnil when nothing is pending */
const NODE *lit_node; /* node recorded when the pending literal was started; passed to ADD_INSN1 on flush */
int cnt; /* number of operands emitted so far (flushed literals + compiled fragments) */
int dregx; /* non-zero: validate each literal fragment with rb_reg_check_preprocess */
};

static int
compile_dstr_fragments(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, int *cntp)
append_dstr_fragment(struct dstr_ctxt *args, const NODE *const node, rb_parser_string_t *str)
{
const struct RNode_LIST *list = RNODE_DSTR(node)->nd_next;
VALUE lit = rb_node_dstr_string_val(node);
LINK_ELEMENT *first_lit = 0;
int cnt = 0;

debugp_param("nd_lit", lit);
if (!NIL_P(lit)) {
cnt++;
if (!RB_TYPE_P(lit, T_STRING)) {
COMPILE_ERROR(ERROR_ARGS "dstr: must be string: %s",
rb_builtin_type_name(TYPE(lit)));
VALUE s = rb_str_new_mutable_parser_string(str);
if (args->dregx) {
VALUE error = rb_reg_check_preprocess(s);
if (!NIL_P(error)) {
COMPILE_ERROR(args->iseq, nd_line(node), "%" PRIsVALUE, error);
return COMPILE_NG;
}
}
if (NIL_P(args->lit)) {
args->lit = s;
args->lit_node = node;
}
else {
rb_str_buf_append(args->lit, s);
}
return COMPILE_OK;
}

static void
flush_dstr_fragment(struct dstr_ctxt *args)
{
if (!NIL_P(args->lit)) {
rb_iseq_t *iseq = args->iseq;
VALUE lit = args->lit;
args->lit = Qnil;
lit = rb_fstring(lit);
ADD_INSN1(ret, node, putobject, lit);
RB_OBJ_WRITTEN(iseq, Qundef, lit);
if (RSTRING_LEN(lit) == 0) first_lit = LAST_ELEMENT(ret);
ADD_INSN1(args->ret, args->lit_node, putobject, lit);
RB_OBJ_WRITTEN(args->iseq, Qundef, lit);
args->cnt++;
}
}

static int
compile_dstr_fragments_0(struct dstr_ctxt *args, const NODE *const node)
{
const struct RNode_LIST *list = RNODE_DSTR(node)->nd_next;
rb_parser_string_t *str = RNODE_DSTR(node)->string;

if (str) {
CHECK(append_dstr_fragment(args, node, str));
}

while (list) {
const NODE *const head = list->nd_head;
if (nd_type_p(head, NODE_STR)) {
lit = rb_node_str_string_val(head);
ADD_INSN1(ret, head, putobject, lit);
RB_OBJ_WRITTEN(iseq, Qundef, lit);
lit = Qnil;
CHECK(append_dstr_fragment(args, node, RNODE_STR(head)->string));
}
else if (nd_type_p(head, NODE_DSTR)) {
CHECK(compile_dstr_fragments_0(args, head));
}
else {
CHECK(COMPILE(ret, "each string", head));
flush_dstr_fragment(args);
rb_iseq_t *iseq = args->iseq;
CHECK(COMPILE(args->ret, "each string", head));
args->cnt++;
}
cnt++;
list = (struct RNode_LIST *)list->nd_next;
}
if (NIL_P(lit) && first_lit) {
ELEM_REMOVE(first_lit);
--cnt;
}
*cntp = cnt;
return COMPILE_OK;
}

/*
 * Compile every fragment of the dynamic string/regexp `node` onto `ret`.
 *
 * Literal pieces are merged by the fragment helpers; whatever literal is
 * still pending once the walk is over is flushed here.  On the normal
 * path the number of emitted operands is stored in *cntp and COMPILE_OK
 * is returned.  `dregx` is forwarded to the helpers unchanged
 * (compile_dregx passes TRUE, compile_dstr passes FALSE).
 *
 * NOTE(review): CHECK is presumed to return early from this function
 * when the walk fails, leaving *cntp untouched -- confirm against the
 * macro definition.
 */
static int
compile_dstr_fragments(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, int *cntp, int dregx)
{
    struct dstr_ctxt ctx = {
        .iseq = iseq,
        .ret = ret,
        .lit = Qnil,
        .lit_node = NULL,
        .cnt = 0,
        .dregx = dregx,
    };

    CHECK(compile_dstr_fragments_0(&ctx, node));

    /* emit the trailing literal, if one is still buffered */
    flush_dstr_fragment(&ctx);
    *cntp = ctx.cnt;

    return COMPILE_OK;
}
Expand Down Expand Up @@ -4571,7 +4633,7 @@ compile_dstr(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node)
RB_OBJ_WRITTEN(iseq, Qundef, lit);
}
else {
CHECK(compile_dstr_fragments(iseq, ret, node, &cnt));
CHECK(compile_dstr_fragments(iseq, ret, node, &cnt, FALSE));
ADD_INSN1(ret, node, concatstrings, INT2FIX(cnt));
}
return COMPILE_OK;
Expand All @@ -4593,7 +4655,7 @@ compile_dregx(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, i
return COMPILE_OK;
}

CHECK(compile_dstr_fragments(iseq, ret, node, &cnt));
CHECK(compile_dstr_fragments(iseq, ret, node, &cnt, TRUE));
ADD_INSN2(ret, node, toregexp, INT2FIX(cflag), INT2FIX(cnt));

if (popped) {
Expand Down
90 changes: 21 additions & 69 deletions parse.y
Original file line number Diff line number Diff line change
Expand Up @@ -1480,9 +1480,6 @@ static rb_ast_id_table_t *local_tbl(struct parser_params*);

static VALUE reg_compile(struct parser_params*, rb_parser_string_t*, int);
static void reg_fragment_setenc(struct parser_params*, rb_parser_string_t*, int);
int rb_parser_reg_fragment_check(struct parser_params*, rb_parser_string_t*, int, rb_parser_reg_fragment_error_func);
static void reg_fragment_error(struct parser_params *, VALUE);
#define reg_fragment_check(p, str, option) rb_parser_reg_fragment_check(p, str, option, reg_fragment_error)

static int literal_concat0(struct parser_params *p, rb_parser_string_t *head, rb_parser_string_t *tail);
static NODE *heredoc_dedent(struct parser_params*,NODE*);
Expand Down Expand Up @@ -13161,12 +13158,26 @@ symbol_append(struct parser_params *p, NODE *symbols, NODE *symbol)
return list_append(p, symbols, symbol);
}

/*
 * Run reg_fragment_setenc() over every literal piece of a dynamic regexp:
 * the node's own leading string (when present) and each NODE_STR element
 * of its fragment list.  Nested NODE_DSTR elements are handled by
 * recursion; any other element kind is skipped.
 */
static void
dregex_fragment_setenc(struct parser_params *p, rb_node_dregx_t *const dreg, int options)
{
    struct RNode_LIST *cell;

    if (dreg->string) {
        reg_fragment_setenc(p, dreg->string, options);
    }

    cell = dreg->nd_next;
    while (cell) {
        NODE *elem = cell->nd_head;

        if (nd_type_p(elem, NODE_DSTR)) {
            /* nested interpolation: walk its pieces the same way */
            dregex_fragment_setenc(p, RNODE_DSTR(elem), options);
        }
        else if (nd_type_p(elem, NODE_STR)) {
            reg_fragment_setenc(p, RNODE_STR(elem)->string, options);
        }
        cell = RNODE_LIST(cell->nd_next);
    }
}

static NODE *
new_regexp(struct parser_params *p, NODE *node, int options, const YYLTYPE *loc)
{
struct RNode_LIST *list;
NODE *prev;

if (!node) {
/* Check string is valid regex */
rb_parser_string_t *str = STRING_NEW0();
Expand All @@ -13190,37 +13201,8 @@ new_regexp(struct parser_params *p, NODE *node, int options, const YYLTYPE *loc)
nd_set_loc(node, loc);
rb_node_dregx_t *const dreg = RNODE_DREGX(node);
dreg->as.nd_cflag = options & RE_OPTION_MASK;
if (!dreg->nd_next) {
/* Check string is valid regex */
reg_compile(p, dreg->string, options);
}
else if (dreg->string) {
reg_fragment_check(p, dreg->string, options);
}
prev = node;
for (list = dreg->nd_next; list; list = RNODE_LIST(list->nd_next)) {
NODE *frag = list->nd_head;
enum node_type type = nd_type(frag);
if (type == NODE_STR || (type == NODE_DSTR && !RNODE_DSTR(frag)->nd_next)) {
rb_parser_string_t *tail = RNODE_STR(frag)->string;
if (reg_fragment_check(p, tail, options) && prev && RNODE_DREGX(prev)->string) {
rb_parser_string_t *lit = prev == node ? dreg->string : RNODE_STR(RNODE_LIST(prev)->nd_head)->string;
if (!literal_concat0(p, lit, tail)) {
return NEW_NIL(loc); /* dummy node on error */
}
rb_parser_str_resize(p, tail, 0);
RNODE_LIST(prev)->nd_next = list->nd_next;
rb_discard_node(p, list->nd_head);
rb_discard_node(p, (NODE *)list);
list = RNODE_LIST(prev);
}
else {
prev = (NODE *)list;
}
}
else {
prev = 0;
}
if (dreg->nd_next) {
dregex_fragment_setenc(p, dreg, options);
}
if (options & RE_OPTION_ONCE) {
node = NEW_ONCE(node, loc);
Expand Down Expand Up @@ -15363,13 +15345,7 @@ rb_reg_fragment_setenc(struct parser_params* p, rb_parser_string_t *str, int opt
rb_parser_enc_associate(p, str, rb_ascii8bit_encoding());
}
else if (rb_is_usascii_enc(p->enc)) {
if (!rb_parser_is_ascii_string(p, str)) {
/* raise in re.c */
rb_parser_enc_associate(p, str, rb_usascii_encoding());
}
else {
rb_parser_enc_associate(p, str, rb_ascii8bit_encoding());
}
rb_parser_enc_associate(p, str, rb_ascii8bit_encoding());
}
return 0;

Expand All @@ -15385,30 +15361,6 @@ reg_fragment_setenc(struct parser_params* p, rb_parser_string_t *str, int option
if (c) reg_fragment_enc_error(p, str, c);
}

/*
 * Error callback for regexp fragment checks: forwards `err` to
 * compile_error(), formatted with the PRIsVALUE conversion.
 */
static void
reg_fragment_error(struct parser_params* p, VALUE err)
{
compile_error(p, "%"PRIsVALUE, err);
}

#ifndef RIPPER
/*
 * Check a single literal regexp fragment.
 *
 * The fragment first goes through reg_fragment_setenc(), is then
 * converted with rb_str_new_parser_string() and handed to
 * rb_reg_check_preprocess().  A non-nil result is stringified and passed
 * to the `error` callback.
 *
 * Returns 1 when the check produced no error, 0 otherwise.
 */
int
rb_parser_reg_fragment_check(struct parser_params* p, rb_parser_string_t *str, int options, rb_parser_reg_fragment_error_func error)
{
    VALUE src, failure;

    reg_fragment_setenc(p, str, options);
    /* TODO */
    src = rb_str_new_parser_string(str);
    failure = rb_reg_check_preprocess(src);
    if (failure == Qnil) {
        return 1;
    }

    error(p, rb_obj_as_string(failure));
    return 0;
}
#endif

#ifndef UNIVERSAL_PARSER
typedef struct {
struct parser_params* parser;
Expand Down Expand Up @@ -15507,7 +15459,7 @@ reg_compile(struct parser_params* p, rb_parser_string_t *str, int options)
if (NIL_P(re)) {
VALUE m = rb_attr_get(rb_errinfo(), idMesg);
rb_set_errinfo(err);
reg_fragment_error(p, m);
compile_error(p, "%"PRIsVALUE, m);
return Qnil;
}
return re;
Expand Down
1 change: 0 additions & 1 deletion test/.excludes-parsey/TestM17N.rb

This file was deleted.

1 change: 0 additions & 1 deletion test/.excludes-parsey/TestMixedUnicodeEscape.rb

This file was deleted.

1 change: 0 additions & 1 deletion test/.excludes-parsey/TestRubyLiteral.rb

This file was deleted.

1 change: 0 additions & 1 deletion test/ripper/test_ripper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ def test_invalid_multibyte_character_in_regexp
assert_equal([[1, 8], :on_tstring_end, "\"", state(:EXPR_END)], lex.shift)
assert_equal([[1, 9], :on_embexpr_end, "}", state(:EXPR_END)], lex.shift)
assert_equal([[1, 10], :on_regexp_end, "/", state(:EXPR_BEG)], lex.shift)
assert_equal([[1, 11], :compile_error, "", state(:EXPR_END), "invalid multibyte character: /\\xCD/"], lex.shift)
assert_empty(lex)
end

Expand Down

0 comments on commit 6bbb470

Please sign in to comment.