diff --git a/slice/edit.go b/slice/edit.go
new file mode 100644
index 0000000..f5884ad
--- /dev/null
+++ b/slice/edit.go
@@ -0,0 +1,184 @@
+package slice
+
+import (
+	"fmt"
+)
+
+// LCS computes a longest common subsequence of as and bs.
+//
+// If more than one common subsequence has the maximum length, one of them is
+// returned. The result is nil if the inputs have no elements in common.
+func LCS[T comparable, Slice ~[]T](as, bs Slice) Slice {
+	return lcsRec(as, bs, make(map[[2]int]Slice))
+}
+
+// lcsRec computes a longest common subsequence of as and bs recursively,
+// memoizing each result in m under the key {len(as), len(bs)}. Every
+// recursive call passes prefixes of its own arguments, so within one top-level
+// call the two lengths identify a subproblem.
+//
+// A slice stored in m may be the answer to several subproblems, so it must
+// never be extended in place: doing so could overwrite an element of another
+// cached result that shares the same backing array.
+func lcsRec[T comparable, Slice ~[]T](as, bs Slice, m map[[2]int]Slice) Slice {
+	if len(as) == 0 || len(bs) == 0 {
+		return nil
+	}
+	na, nb := len(as), len(bs)
+	key := [2]int{na, nb}
+	if v, ok := m[key]; ok {
+		return v
+	}
+
+	var ans Slice
+	if as[na-1] == bs[nb-1] {
+		// Clip the capacity of the cached prefix so that append copies it to
+		// a new array instead of writing into shared spare capacity.
+		prev := lcsRec(as[:na-1], bs[:nb-1], m)
+		ans = append(prev[:len(prev):len(prev)], as[na-1])
+	} else {
+		// Drop the last element of one input or the other and keep the longer
+		// result, preferring the one that shortens as when they tie.
+		lhs := lcsRec(as[:na-1], bs, m)
+		rhs := lcsRec(as, bs[:nb-1], m)
+		if len(lhs) >= len(rhs) {
+			ans = lhs
+		} else {
+			ans = rhs
+		}
+	}
+	m[key] = ans
+	return ans
+}
+
+// EditOp is the opcode of an edit sequence instruction.
+type EditOp byte
+
+const (
+	OpDelete  EditOp = '-' // Delete items from lhs
+	OpInsert  EditOp = '+' // Insert items from rhs
+	OpCopy    EditOp = '=' // Copy elements from lhs
+	OpReplace EditOp = 'x' // Replace with items from rhs
+)
+
+// Edit is an edit operation specified as part of a diff.
+// Each edit refers to a specific span of one of the inputs.
+type Edit struct {
+	Op EditOp // the diff operation to apply at the current offset
+
+	// N specifies the number of inputs affected by the operation.
+	N int
+
+	// X specifies an additional argument affected by the operation:
+	//
+	// For OpDelete and OpCopy, X is not used and will be 0.
+	// For OpInsert and OpReplace, X specifies a starting offset in rhs from
+	// which values are to be copied.
+	X int
+}
+
+// String renders e as its opcode followed by N, with ":X" appended for
+// OpInsert and OpReplace (for example "=2" or "+3:0").
+func (e Edit) String() string {
+	if e.Op == OpInsert || e.Op == OpReplace {
+		return fmt.Sprintf("%c%d:%d", e.Op, e.N, e.X)
+	}
+	return fmt.Sprintf("%c%d", e.Op, e.N)
+}
+
+// EditScript computes a minimal-length sequence of Edit operations that will
+// transform lhs into rhs when applied.
+//
+// An edit sequence is processed in order starting at offset 0 of lhs. Items
+// are sent to the output according to the following rules.
+//
+// For each element e of the edit script, if e.Op is:
+//
+//   - OpDelete: advance the offset by e.N (no output)
+//   - OpInsert: output e.N elements from rhs at position e.X
+//   - OpCopy: output e.N elements from lhs at the current offset, and advance
+//     the offset by e.N positions
+//   - OpReplace: output e.N elements from rhs at position e.X, and advance the
+//     offset by e.N positions
+//
+// After all edits are processed, output any remaining elements of lhs. This
+// completes the processing of the script.
+func EditScript[T comparable, Slice ~[]T](lhs, rhs Slice) []Edit {
+	lcs := LCS(lhs, rhs)
+
+	// To construct the edit sequence, i scans forward through lcs.
+	// For each i, we find the unclaimed elements of lhs and rhs prior to the
+	// occurrence of lcs[i].
+	//
+	// Elements of lhs before lcs[i] must be deleted from the result.
+	// Elements of rhs before lcs[i] must be inserted into the result.
+	// Elements equal to lcs members are copied.
+	//
+	// However, whenever we have deletes followed immediately by inserts, the
+	// net effect is to "replace" some or all of the deleted items with the
+	// inserted ones. We represent this case explicitly with a replace edit.
+	lpos, rpos, i := 0, 0, 0
+
+	var out []Edit
+	for i < len(lcs) {
+		// Count the numbers of elements of lhs and rhs prior to the first match.
+		lend := lpos
+		for lhs[lend] != lcs[i] {
+			lend++
+		}
+		rend := rpos
+		for rhs[rend] != lcs[i] {
+			rend++
+		}
+
+		// Add exchanges for overlapping delete + insert pairs.
+		if x := min(lend-lpos, rend-rpos); x > 0 {
+			out = append(out, Edit{Op: OpReplace, N: x, X: rpos})
+			lpos += x
+			rpos += x
+		}
+
+		// Record any remaining unpaired deletions and insertions.
+		// Note deletions need to go first.
+		if lend > lpos {
+			out = append(out, Edit{Op: OpDelete, N: lend - lpos})
+		}
+		if rend > rpos {
+			out = append(out, Edit{Op: OpInsert, N: rend - rpos, X: rpos})
+		}
+
+		lpos, rpos = lend, rend
+
+		// Reaching here, lhs[lpos] == rhs[rpos] == lcs[i].
+		// Count how many elements are equal and copy them.
+		m := 1
+		for i+m < len(lcs) && lhs[lpos+m] == rhs[rpos+m] {
+			m++
+		}
+		i += m
+		lpos += m
+		rpos += m
+		out = append(out, Edit{Op: OpCopy, N: m})
+	}
+
+	// Add exchanges for overlapping delete + insert pairs.
+	if x := min(len(lhs)-lpos, len(rhs)-rpos); x > 0 {
+		out = append(out, Edit{Op: OpReplace, N: x, X: rpos})
+		lpos += x
+		rpos += x
+	}
+	// Delete any leftover elements of lhs.
+	if n := len(lhs) - lpos; n > 0 {
+		out = append(out, Edit{Op: OpDelete, N: n})
+	}
+	// Insert any leftover elements of rhs.
+	if n := len(rhs) - rpos; n > 0 {
+		out = append(out, Edit{Op: OpInsert, N: n, X: rpos})
+	}
+	// A trailing copy is implied by the rule that leftover elements of lhs are
+	// output after the script ends, so omit it.
+	if n := len(out); n > 0 && out[n-1].Op == OpCopy {
+		return out[:n-1]
+	}
+	return out
+}
diff --git a/slice/edit_test.go b/slice/edit_test.go
new file mode 100644
index 0000000..ae55692
--- /dev/null
+++ b/slice/edit_test.go
@@ -0,0 +1,198 @@
+package slice_test
+
+import (
+	"math/rand"
+	"slices"
+	"strconv"
+	"strings"
+	"testing"
+
+	"github.com/creachadair/mds/slice"
+)
+
+func TestLCS(t *testing.T) {
+	tests := []struct {
+		a, b string
+		want string
+	}{
+		{"", "", ""},
+
+		{"a", "", ""},
+		{"", "b", ""},
+
+		{"a b c", "", ""},
+		{"", "d e f", ""},
+
+		{"a", "a b c", "a"},
+		{"b", "a b c", "b"},
+		{"c", "a b c", "c"},
+		{"d", "a b c", ""},
+
+		{"a b c", "a b c", "a b c"},
+		{"a b c", "a b", "a b"},
+		{"b c", "a b c", "b c"},
+
+		{"you will be lucky to get this to work at all",
+			"will we be so lucky as to get this to work in the end",
+			"will be lucky to get this to work"},
+
+		{"a foolish consistency is the hobgoblin of little minds",
+			"four foolish fat hens ate the hobgoblin who is little and minds not",
+			"foolish the hobgoblin little minds"},
+
+		// Both "1 2 3 4" and "1 2 3 5" have maximum length and extend the same
+		// memoized prefix; computing the second must not disturb the first.
+		{"1 2 3 4 5", "1 2 3 5 4", "1 2 3 4"},
+	}
+	for _, tc := range tests {
+		as, bs := strings.Fields(tc.a), strings.Fields(tc.b)
+		want := strings.Fields(tc.want)
+		got := slice.LCS(as, bs)
+		if !slices.Equal(got, want) {
+			t.Errorf("LCS(%s, %s):\ngot: %v\nwant: %v", tc.a, tc.b, got, want)
+		}
+	}
+}
+
+func TestLCSRandom(t *testing.T) {
+	// Append n randomly generated letters from alpha to *ss.
+	pad := func(ss *[]string, n int, alpha string) {
+		for i := 0; i < n; i++ {
+			j := rand.Intn(len(alpha))
+			*ss = append(*ss, alpha[j:j+1])
+		}
+	}
+
+	// Append 0-3 randomly generated letters from alpha before and after each
+	// word in want, and return the resulting sequence.
+	input := func(want []string, alpha string) []string {
+		var out []string
+		for _, w := range want {
+			pad(&out, rand.Intn(4), alpha)
+			out = append(out, w)
+		}
+		pad(&out, rand.Intn(4), alpha)
+		return out
+	}
+
+	// Generate a longest common subsequence of length i, and inputs constructed
+	// to have that as their LCS, and verify that they do.
+	for i := 0; i < 200; i += 20 {
+		var want []string
+		pad(&want, i, "abcdefghijklmonpqrstuvwxyz")
+
+		// N.B. The alphabets used by the probe string must not overlap with the
+		// inputs, nor the inputs with each other.
+		//
+		// Probe string: lower-case
+		// LHS: digits
+		// RHS: upper-case
+
+		lhs := input(want, "0123456789")
+		rhs := input(want, "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+		got := slice.LCS(lhs, rhs)
+		if !slices.Equal(got, want) {
+			t.Errorf("LCS(%q, %q):\ngot: %q\nwant: %q", lhs, rhs, got, want)
+		}
+	}
+}
+
+func TestEditScript(t *testing.T) {
+	tests := []struct {
+		a, b string
+		want []slice.Edit
+	}{
+		{"", "", nil},
+
+		{"a", "", pedit(t, "-1")},
+		{"", "b", pedit(t, "+1:0")},
+
+		{"a b c", "", pedit(t, "-3")},
+		{"", "d e f", pedit(t, "+3:0")},
+
+		{"a", "a b c", pedit(t, "=1 +2:1")},
+		{"b", "a b c", pedit(t, "+1:0 =1 +1:2")},
+		{"c", "a b c", pedit(t, "+2:0")},
+		{"d", "a b c", pedit(t, "x1:0 +2:1")},
+
+		{"a b c", "a b c", pedit(t, "")},
+		{"a b c", "a x c", pedit(t, "=1 x1:1")},
+		{"a b c", "a b", pedit(t, "=2 -1")},
+		{"b c", "a b c", pedit(t, "+1:0")},
+
+		{"a x b x c", "1 x b x 2", pedit(t, "x1:0 =3 x1:4")},
+		{"fly you fools", "to fly you must not be fools", pedit(t, "+1:0 =2 +3:3")},
+
+		{"have the best time it is possible to have under the circumstances",
+			"I hope you have the time of your life in the forest",
+			pedit(t, "+3:0 =2 -1 =1 x4:6 -2 =1 x1:11"),
+		},
+	}
+	for _, tc := range tests {
+		as, bs := strings.Fields(tc.a), strings.Fields(tc.b)
+		got := slice.EditScript(as, bs)
+		if !slices.Equal(got, tc.want) {
+			t.Errorf("EditScript(%q, %q):\ngot: %v\nwant: %v", tc.a, tc.b, got, tc.want)
+		}
+		checkApply(t, as, bs, got)
+	}
+}
+
+// checkApply verifies that applying the specified edit script to lhs produces rhs.
+func checkApply[T comparable, Slice ~[]T](t *testing.T, lhs, rhs Slice, edit []slice.Edit) {
+	t.Helper()
+
+	var out Slice
+	i := 0
+	for _, e := range edit {
+		switch e.Op {
+		case slice.OpDelete:
+			i += e.N
+		case slice.OpInsert:
+			out = append(out, rhs[e.X:e.X+e.N]...)
+		case slice.OpCopy:
+			out = append(out, lhs[i:i+e.N]...)
+			i += e.N
+		case slice.OpReplace:
+			out = append(out, rhs[e.X:e.X+e.N]...)
+			i += e.N
+		default:
+			t.Fatalf("Unexpected edit operation: %v", e)
+		}
+	}
+	out = append(out, lhs[i:]...)
+	if !slices.Equal(out, rhs) {
+		t.Errorf("Apply %v:\ngot: %v\nwant: %v", edit, out, rhs)
+	} else {
+		t.Logf("Apply L %v E %v OK: %v", lhs, edit, out)
+	}
+}
+
+// pedit parses a string of space-separated edit strings matching the string
+// format rendered by the String method of a slice.Edit.
+func pedit(t *testing.T, ss string) (out []slice.Edit) {
+	t.Helper()
+	for _, s := range strings.Fields(ss) {
+		var next slice.Edit
+		switch s[0] {
+		case '-', '=', '+', 'x':
+			next.Op = slice.EditOp(s[0])
+		default:
+			t.Fatalf("Invalid edit op: %c", s[0])
+		}
+		var err error
+		fst, snd, ok := strings.Cut(s[1:], ":")
+		next.N, err = strconv.Atoi(fst)
+		if err != nil {
+			t.Fatalf("Invalid N: %v", err)
+		}
+		if ok {
+			next.X, err = strconv.Atoi(snd)
+			if err != nil {
+				t.Fatalf("Invalid X: %v", err)
+			}
+		}
+		out = append(out, next)
+	}
+	return
+}