-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Given comparable slices A, B: - LCS(A, B) computes a longest common subsequence of A and B - EditScript(A, B) computes a minimal edit script from A to B
- Loading branch information
1 parent
507f589
commit f189538
Showing
2 changed files
with
357 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
package slice | ||
|
||
import ( | ||
"fmt" | ||
) | ||
|
||
// LCS computes a longest common subsequence of as and bs. | ||
func LCS[T comparable, Slice ~[]T](as, bs Slice) Slice { | ||
return lcsRec(as, bs, make(map[[2]int]Slice)) | ||
} | ||
|
||
// lcsRec returns a longest common subsequence of as and bs, using m to
// memoize the results for prefix pairs that have already been solved. The key
// of m is the pair (len(as), len(bs)) identifying the prefixes compared.
//
// When the final elements are equal, the result is the solution for both
// prefixes shortened by one, followed by that element. Otherwise it is the
// longer of the solutions obtained by dropping the last element of as or of
// bs, preferring the former on a tie.
//
// The slices stored in m are shared: the same slice value may be recorded
// under several keys. A memoized result must therefore never be extended in
// place, or a later append would overwrite an element that other entries
// still refer to.
func lcsRec[T comparable, Slice ~[]T](as, bs Slice, m map[[2]int]Slice) Slice {
	na, nb := len(as), len(bs)
	if na == 0 || nb == 0 {
		return nil
	}
	key := [2]int{na, nb}
	if v, ok := m[key]; ok {
		return v
	}

	var ans Slice
	if as[na-1] == bs[nb-1] {
		prev := lcsRec(as[:na-1], bs[:nb-1], m)

		// Clip the capacity so that append must allocate fresh storage
		// instead of writing into spare capacity shared with other entries.
		ans = append(prev[:len(prev):len(prev)], as[na-1])
	} else {
		lhs := lcsRec(as[:na-1], bs, m)
		rhs := lcsRec(as, bs[:nb-1], m)
		ans = rhs
		if len(lhs) >= len(rhs) {
			ans = lhs
		}
	}
	m[key] = ans
	return ans
}
|
||
// EditOp identifies the kind of operation performed by one instruction of an
// edit script.
type EditOp byte

// The operations that can appear in an edit script.
const (
	OpDelete  = EditOp('-') // discard items of lhs
	OpInsert  = EditOp('+') // add items taken from rhs
	OpCopy    = EditOp('=') // keep items of lhs
	OpReplace = EditOp('x') // substitute items of rhs for items of lhs
)
|
||
// Edit is an edit operation transforming specified as part of a diff. | ||
// Each edit refers to a specific span of one of the inputs. | ||
type Edit struct { | ||
Op EditOp // the diff operation to apply at the current offset | ||
|
||
// N specifies the number of inputs affected by the operation. | ||
N int | ||
|
||
// X specifies an additionl argument affected by the operation: | ||
// | ||
// For OpDelete and OpCopy, X is not used and will be 0. | ||
// For OpInsert and OpReplace, X specifies a starting offset in rhs from | ||
// which values are to be copied. | ||
X int | ||
} | ||
|
||
func (e Edit) String() string { | ||
if e.Op == OpInsert || e.Op == OpReplace { | ||
return fmt.Sprintf("%c%d:%d", e.Op, e.N, e.X) | ||
} | ||
return fmt.Sprintf("%c%d", e.Op, e.N) | ||
} | ||
|
||
// EditScript computes a minimal-length sequence of Edit operations that will | ||
// transform lhs into rhs when applied. | ||
// | ||
// An edit sequence is processed in order starting at offset 0 of lhs. Items | ||
// are sent to the output according to the following rules. | ||
// | ||
// For each element e of the edit script, if e.Op is: | ||
// | ||
// - OpDelete: advance the offset by e.N (no output) | ||
// - OpInsert: output e.N elements from rhs at position e.X | ||
// - OpCopy: output e.N elements from lhs at the current offset, and advance | ||
// the offset by e.N positions | ||
// - OpReplace: output e.N elements from rhs at position e.X, and advance the | ||
// offset by e.N positions | ||
// | ||
// After all edits are processed, output any remaining elements of lhs. This | ||
// completes the processing of the script. | ||
func EditScript[T comparable, Slice ~[]T](lhs, rhs Slice) []Edit { | ||
lcs := LCS(lhs, rhs) | ||
|
||
// To construct the edit sequence, i scans forward through lcs. | ||
// For each i, we find the unclaimed elements of lhs and rhs prior to the | ||
// occurrence of lcs[i]. | ||
// | ||
// Elements of lhs before lcs[i] must be deleted from the result. | ||
// Elements of rhs before lcs[i] must be inserted into the result. | ||
// Elements equal to lcs members are copied. | ||
// | ||
// However, whenever we have deletes followed immediately by inserts, the | ||
// net effect is to "replace" some or all of the deleted items with the | ||
// inserted ones. We represent this case explicitly with a replace edit. | ||
lpos, rpos, i := 0, 0, 0 | ||
|
||
var out []Edit | ||
for i < len(lcs) { | ||
// Count the numbers of elements of lhs and rhs prior to the first match. | ||
lend := lpos | ||
for lhs[lend] != lcs[i] { | ||
lend++ | ||
} | ||
rend := rpos | ||
for rhs[rend] != lcs[i] { | ||
rend++ | ||
} | ||
|
||
// Add exchanges for overlapping delete + insert pairs. | ||
if x := min(lend-lpos, rend-rpos); x > 0 { | ||
out = append(out, Edit{Op: OpReplace, N: x, X: rpos}) | ||
lpos += x | ||
rpos += x | ||
} | ||
|
||
// Record any remaining unpaired deletions and insertions. | ||
// Note deletions need to go first. | ||
if lend > lpos { | ||
out = append(out, Edit{Op: OpDelete, N: lend - lpos}) | ||
} | ||
if rend > rpos { | ||
out = append(out, Edit{Op: OpInsert, N: rend - rpos, X: rpos}) | ||
} | ||
|
||
lpos, rpos = lend, rend | ||
|
||
// Reaching here, lhs[lpos] == rhs[rpos] == lcs[i]. | ||
// Count how many elements are equal and copy them. | ||
m := 1 | ||
for i+m < len(lcs) && lhs[lpos+m] == rhs[rpos+m] { | ||
m++ | ||
} | ||
i += m | ||
lpos += m | ||
rpos += m | ||
out = append(out, Edit{Op: OpCopy, N: m}) | ||
} | ||
|
||
// Add exchanges for overlapping delete + insert pairs. | ||
if x := min(len(lhs)-lpos, len(rhs)-rpos); x > 0 { | ||
out = append(out, Edit{Op: OpReplace, N: x, X: rpos}) | ||
lpos += x | ||
rpos += x | ||
} | ||
// Delete any leftover elements of lhs. | ||
if n := len(lhs) - lpos; n > 0 { | ||
out = append(out, Edit{Op: OpDelete, N: n}) | ||
} | ||
// Insert any leftover elements of rhs. | ||
if n := len(rhs) - rpos; n > 0 { | ||
out = append(out, Edit{Op: OpInsert, N: n, X: rpos}) | ||
} | ||
if n := len(out); n > 0 && out[n-1].Op == OpCopy { | ||
return out[:n-1] | ||
} | ||
return out | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
package slice_test | ||
|
||
import ( | ||
"math/rand" | ||
"slices" | ||
"strconv" | ||
"strings" | ||
"testing" | ||
|
||
"github.com/creachadair/mds/slice" | ||
) | ||
|
||
func TestLCS(t *testing.T) { | ||
tests := []struct { | ||
a, b string | ||
want string | ||
}{ | ||
{"", "", ""}, | ||
|
||
{"a", "", ""}, | ||
{"", "b", ""}, | ||
|
||
{"a b c", "", ""}, | ||
{"", "d e f", ""}, | ||
|
||
{"a", "a b c", "a"}, | ||
{"b", "a b c", "b"}, | ||
{"c", "a b c", "c"}, | ||
{"d", "a b c", ""}, | ||
|
||
{"a b c", "a b c", "a b c"}, | ||
{"a b c", "a b", "a b"}, | ||
{"b c", "a b c", "b c"}, | ||
|
||
{"you will be lucky to get this to work at all", | ||
"will we be so lucky as to get this to work in the end", | ||
"will be lucky to get this to work"}, | ||
|
||
{"a foolish consistency is the hobgoblin of little minds", | ||
"four foolish fat hens ate the hobgoblin who is little and minds not", | ||
"foolish the hobgoblin little minds"}, | ||
} | ||
for _, tc := range tests { | ||
as, bs := strings.Fields(tc.a), strings.Fields(tc.b) | ||
want := strings.Fields(tc.want) | ||
got := slice.LCS(as, bs) | ||
if !slices.Equal(got, want) { | ||
t.Errorf("LCS(%s, %s):\ngot: %v\nwant: %v", tc.a, tc.b, got, want) | ||
} | ||
} | ||
} | ||
|
||
func TestLCSRandom(t *testing.T) { | ||
// Append n randomly generated letters from alpha to *ss. | ||
pad := func(ss *[]string, n int, alpha string) { | ||
for i := 0; i < n; i++ { | ||
j := rand.Intn(len(alpha)) | ||
*ss = append(*ss, alpha[j:j+1]) | ||
} | ||
} | ||
|
||
// Append 0-4 randomly generated letters from alpha before and after each | ||
// word in want, and return the resulting sequence. | ||
input := func(want []string, alpha string) []string { | ||
var out []string | ||
for _, w := range want { | ||
pad(&out, rand.Intn(4), alpha) | ||
out = append(out, w) | ||
} | ||
pad(&out, rand.Intn(4), alpha) | ||
return out | ||
} | ||
|
||
// Generate a longest common subsequence of length i, and inputs constructed | ||
// to have that as their LCS, and verify that they do. | ||
for i := 0; i < 200; i += 20 { | ||
var want []string | ||
pad(&want, i, "abcdefghijklmonpqrstuvwxyz") | ||
|
||
// N.B. The alphabets used by the probe string must not overlap with the | ||
// inputs, nor the inputs with each other. | ||
// | ||
// Probe string: lower-case | ||
// LHS: digits | ||
// RHS: upper-case | ||
|
||
lhs := input(want, "0123456789") | ||
rhs := input(want, "ABCDEFGHIJKLMNOPQRSTUVWXYZ") | ||
got := slice.LCS(lhs, rhs) | ||
if !slices.Equal(got, want) { | ||
t.Errorf("LCS(%q, %q):\ngot: %q\nwant: %q", lhs, rhs, got, want) | ||
} | ||
} | ||
} | ||
|
||
func TestEditScript(t *testing.T) { | ||
tests := []struct { | ||
a, b string | ||
want []slice.Edit | ||
}{ | ||
{"", "", nil}, | ||
|
||
{"a", "", pedit(t, "-1")}, | ||
{"", "b", pedit(t, "+1:0")}, | ||
|
||
{"a b c", "", pedit(t, "-3")}, | ||
{"", "d e f", pedit(t, "+3:0")}, | ||
|
||
{"a", "a b c", pedit(t, "=1 +2:1")}, | ||
{"b", "a b c", pedit(t, "+1:0 =1 +1:2")}, | ||
{"c", "a b c", pedit(t, "+2:0")}, | ||
{"d", "a b c", pedit(t, "x1:0 +2:1")}, | ||
|
||
{"a b c", "a b c", pedit(t, "")}, | ||
{"a b c", "a x c", pedit(t, "=1 x1:1")}, | ||
{"a b c", "a b", pedit(t, "=2 -1")}, | ||
{"b c", "a b c", pedit(t, "+1:0")}, | ||
|
||
{"a x b x c", "1 x b x 2", pedit(t, "x1:0 =3 x1:4")}, | ||
{"fly you fools", "to fly you must not be fools", pedit(t, "+1:0 =2 +3:3")}, | ||
|
||
{"have the best time it is possible to have under the circumstances", | ||
"I hope you have the time of your life in the forest", | ||
pedit(t, "+3:0 =2 -1 =1 x4:6 -2 =1 x1:11"), | ||
}, | ||
} | ||
for _, tc := range tests { | ||
as, bs := strings.Fields(tc.a), strings.Fields(tc.b) | ||
got := slice.EditScript(as, bs) | ||
if !slices.Equal(got, tc.want) { | ||
t.Errorf("EditScript(%q, %q):\ngot: %v\nwant: %v", tc.a, tc.b, got, tc.want) | ||
} | ||
checkApply(t, as, bs, got) | ||
} | ||
} | ||
|
||
// checkApply verifies that applying the specified edit script to lhs produces rhs. | ||
func checkApply[T comparable, Slice ~[]T](t *testing.T, lhs, rhs Slice, edit []slice.Edit) { | ||
t.Helper() | ||
|
||
var out Slice | ||
i := 0 | ||
for _, e := range edit { | ||
switch e.Op { | ||
case slice.OpDelete: | ||
i += e.N | ||
case slice.OpInsert: | ||
out = append(out, rhs[e.X:e.X+e.N]...) | ||
case slice.OpCopy: | ||
out = append(out, lhs[i:i+e.N]...) | ||
i += e.N | ||
case slice.OpReplace: | ||
out = append(out, rhs[e.X:e.X+e.N]...) | ||
i += e.N | ||
default: | ||
t.Fatalf("Unexpected edit operation: %v", e) | ||
} | ||
} | ||
out = append(out, lhs[i:]...) | ||
if !slices.Equal(out, rhs) { | ||
t.Errorf("Apply %v:\ngot: %v\nwant: %v", edit, out, rhs) | ||
} else { | ||
t.Logf("Apply L %v E %v OK: %v", lhs, edit, out) | ||
} | ||
} | ||
|
||
// pedit parses a string of space-separated edit strings matching the string | ||
// format rendered by the String method of a slice.Edit. | ||
func pedit(t *testing.T, ss string) (out []slice.Edit) { | ||
t.Helper() | ||
for _, s := range strings.Fields(ss) { | ||
var next slice.Edit | ||
switch s[0] { | ||
case '-', '=', '+', 'x': | ||
next.Op = slice.EditOp(s[0]) | ||
default: | ||
t.Fatalf("Invalid edit op: %c", s[0]) | ||
} | ||
var err error | ||
fst, snd, ok := strings.Cut(s[1:], ":") | ||
next.N, err = strconv.Atoi(fst) | ||
if err != nil { | ||
t.Fatalf("Invalid N: %v", err) | ||
} | ||
if ok { | ||
next.X, err = strconv.Atoi(snd) | ||
if err != nil { | ||
t.Fatalf("Invalid X: %v", err) | ||
} | ||
} | ||
out = append(out, next) | ||
} | ||
return | ||
} |