Skip to content

Commit

Permalink
slice: add LCS and EditScript
Browse files Browse the repository at this point in the history
Given comparable slices A, B:

- LCS(A, B) computes a longest common subsequence of A and B
- EditScript(A, B) computes a minimal edit script from A to B
  • Loading branch information
creachadair committed Feb 17, 2024
1 parent 507f589 commit f189538
Show file tree
Hide file tree
Showing 2 changed files with 357 additions and 0 deletions.
163 changes: 163 additions & 0 deletions slice/edit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
package slice

import (
"fmt"
)

// LCS returns a longest common subsequence of as and bs: a longest sequence
// of elements that appear, in the same relative order, in both inputs.
// The result is empty if either input is empty.
func LCS[T comparable, Slice ~[]T](as, bs Slice) Slice {
	// The memo table is keyed by the pair of prefix lengths under comparison.
	memo := make(map[[2]int]Slice)
	return lcsRec(as, bs, memo)
}

// lcsRec returns a longest common subsequence of as and bs by recursing on
// progressively shorter prefixes of the two inputs.
//
// The table m memoizes the answer for each pair of prefix lengths
// (len(as), len(bs)), so that each distinct subproblem is solved only once.
// The caller must supply a non-nil map, and must not reuse a map across
// different top-level inputs, since the keys record only lengths.
//
// A slice stored in m may be shared by several table entries (a mismatch
// simply reuses the better of its two sub-results), so memoized slices are
// never extended in place.
func lcsRec[T comparable, Slice ~[]T](as, bs Slice, m map[[2]int]Slice) Slice {
	na, nb := len(as), len(bs)
	if na == 0 || nb == 0 {
		return nil
	}
	key := [2]int{na, nb}
	if v, ok := m[key]; ok {
		return v
	}

	var ans Slice
	if as[na-1] == bs[nb-1] {
		// The final elements match, so they extend an LCS of the two prefixes.
		prev := lcsRec(as[:na-1], bs[:nb-1], m)

		// Clip the capacity of prev so that append is forced to copy. Without
		// this, two table entries that alias the same backing array could each
		// append in place, and the later write would silently replace the
		// element recorded by the earlier one.
		ans = append(prev[:len(prev):len(prev)], as[na-1])
	} else {
		// Otherwise drop the last element of one side or the other, and keep
		// whichever sub-result is longer (preferring lhs on a tie).
		lhs := lcsRec(as[:na-1], bs, m)
		rhs := lcsRec(as, bs[:nb-1], m)
		ans = rhs
		if len(lhs) >= len(rhs) {
			ans = lhs
		}
	}
	m[key] = ans
	return ans
}

// EditOp identifies the kind of operation performed by one instruction of an
// edit script. Each opcode is a printable character.
type EditOp byte

const (
	OpDelete  EditOp = '-' // Delete (skip over) items of lhs
	OpInsert  EditOp = '+' // Insert items taken from rhs
	OpCopy    EditOp = '=' // Copy items of lhs unchanged
	OpReplace EditOp = 'x' // Replace items of lhs with items taken from rhs
)

// Edit is a single edit operation specified as part of a diff.
// Each edit refers to a specific span of one of the inputs.
type Edit struct {
	Op EditOp // the diff operation to apply at the current offset

	// N specifies the number of inputs affected by the operation.
	N int

	// X specifies an additional argument affected by the operation:
	//
	// For OpDelete and OpCopy, X is not used and will be 0.
	// For OpInsert and OpReplace, X specifies a starting offset in rhs from
	// which values are to be copied.
	X int
}

// String renders e as its opcode character followed by N. For the operations
// that read from rhs (OpInsert and OpReplace) the rendering also includes a
// colon and the offset X, for example "+2:3".
func (e Edit) String() string {
	switch e.Op {
	case OpInsert, OpReplace:
		return fmt.Sprintf("%c%d:%d", e.Op, e.N, e.X)
	default:
		return fmt.Sprintf("%c%d", e.Op, e.N)
	}
}

// EditScript computes a sequence of Edit operations that will transform lhs
// into rhs when applied. The script is derived from a longest common
// subsequence of the two inputs.
//
// An edit sequence is processed in order starting at offset 0 of lhs. Items
// are sent to the output according to the following rules.
//
// For each element e of the edit script, if e.Op is:
//
//   - OpDelete: advance the offset by e.N (no output)
//   - OpInsert: output e.N elements from rhs at position e.X
//   - OpCopy: output e.N elements from lhs at the current offset, and advance
//     the offset by e.N positions
//   - OpReplace: output e.N elements from rhs at position e.X, and advance the
//     offset by e.N positions
//
// After all edits are processed, output any remaining elements of lhs. This
// completes the processing of the script. Because of that rule, a copy that
// would otherwise be the final edit of the script is omitted.
func EditScript[T comparable, Slice ~[]T](lhs, rhs Slice) []Edit {
	common := LCS(lhs, rhs)

	var script []Edit

	// li and ri are the offsets of the first elements of lhs and rhs that have
	// not yet been accounted for by an edit.
	li, ri := 0, 0

	// flush emits the edits covering the unmatched spans lhs[li:lstop] and
	// rhs[ri:rstop], then moves li and ri up to the ends of those spans.
	//
	// Unmatched lhs elements must be deleted and unmatched rhs elements must
	// be inserted. Where both exist, a delete followed by an insert amounts to
	// a replacement, so the overlapping portion is emitted as one OpReplace.
	// Whatever is left over is emitted as a plain delete or insert.
	flush := func(lstop, rstop int) {
		if paired := min(lstop-li, rstop-ri); paired > 0 {
			script = append(script, Edit{Op: OpReplace, N: paired, X: ri})
			li += paired
			ri += paired
		}
		// Deletions go ahead of insertions.
		if lstop > li {
			script = append(script, Edit{Op: OpDelete, N: lstop - li})
		}
		if rstop > ri {
			script = append(script, Edit{Op: OpInsert, N: rstop - ri, X: ri})
		}
		li, ri = lstop, rstop
	}

	for k := 0; k < len(common); {
		// Locate the next occurrence of common[k] in each input. Everything
		// before it on either side is unmatched.
		lstop := li
		for lhs[lstop] != common[k] {
			lstop++
		}
		rstop := ri
		for rhs[rstop] != common[k] {
			rstop++
		}
		flush(lstop, rstop)

		// Now lhs[li] == rhs[ri] == common[k]. Extend the match for as long
		// as the inputs continue to agree, and copy the whole run at once.
		run := 1
		for k+run < len(common) && lhs[li+run] == rhs[ri+run] {
			run++
		}
		script = append(script, Edit{Op: OpCopy, N: run})
		k += run
		li += run
		ri += run
	}

	// Account for whatever follows the last common element on each side.
	flush(len(lhs), len(rhs))

	// A trailing copy is implied by the processing rules, so leave it out.
	if n := len(script); n > 0 && script[n-1].Op == OpCopy {
		return script[:n-1]
	}
	return script
}
194 changes: 194 additions & 0 deletions slice/edit_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package slice_test

import (
"math/rand"
"slices"
"strconv"
"strings"
"testing"

"github.com/creachadair/mds/slice"
)

// TestLCS checks slice.LCS against a table of hand-written cases. Each input
// and expected result is a string that is split on whitespace into words.
func TestLCS(t *testing.T) {
	type testCase struct {
		a, b string
		want string
	}
	cases := []testCase{
		// Both inputs empty.
		{"", "", ""},

		// One input empty.
		{"a", "", ""},
		{"", "b", ""},

		{"a b c", "", ""},
		{"", "d e f", ""},

		// A single word against a longer input.
		{"a", "a b c", "a"},
		{"b", "a b c", "b"},
		{"c", "a b c", "c"},
		{"d", "a b c", ""},

		// Equal inputs, and one input a prefix or suffix of the other.
		{"a b c", "a b c", "a b c"},
		{"a b c", "a b", "a b"},
		{"b c", "a b c", "b c"},

		// Longer sentences.
		{"you will be lucky to get this to work at all",
			"will we be so lucky as to get this to work in the end",
			"will be lucky to get this to work"},

		{"a foolish consistency is the hobgoblin of little minds",
			"four foolish fat hens ate the hobgoblin who is little and minds not",
			"foolish the hobgoblin little minds"},
	}
	for _, c := range cases {
		lhs := strings.Fields(c.a)
		rhs := strings.Fields(c.b)
		expect := strings.Fields(c.want)
		if got := slice.LCS(lhs, rhs); !slices.Equal(got, expect) {
			t.Errorf("LCS(%s, %s):\ngot: %v\nwant: %v", c.a, c.b, got, expect)
		}
	}
}

// TestLCSRandom checks slice.LCS on randomly generated inputs that are
// constructed around a known common subsequence.
func TestLCSRandom(t *testing.T) {
	// letters returns dst extended by n single-letter strings, each chosen at
	// random from alpha.
	letters := func(dst []string, n int, alpha string) []string {
		for ; n > 0; n-- {
			k := rand.Intn(len(alpha))
			dst = append(dst, alpha[k:k+1])
		}
		return dst
	}

	// surround returns a copy of probe with 0-3 random letters from alpha
	// placed before each of its words, and 0-3 more after the last one.
	surround := func(probe []string, alpha string) []string {
		var padded []string
		for _, word := range probe {
			padded = letters(padded, rand.Intn(4), alpha)
			padded = append(padded, word)
		}
		return letters(padded, rand.Intn(4), alpha)
	}

	// For a range of lengths, generate a probe sequence of that length, build
	// two inputs that have the probe as their longest common subsequence, and
	// verify that LCS recovers it.
	for size := 0; size < 200; size += 20 {
		var probe []string
		probe = letters(probe, size, "abcdefghijklmonpqrstuvwxyz")

		// N.B. The alphabet used for the probe must not overlap with those
		// used to pad the inputs, nor may the padding alphabets overlap with
		// each other:
		//
		//   Probe: lower-case letters
		//   LHS:   digits
		//   RHS:   upper-case letters

		lhs := surround(probe, "0123456789")
		rhs := surround(probe, "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
		if got := slice.LCS(lhs, rhs); !slices.Equal(got, probe) {
			t.Errorf("LCS(%q, %q):\ngot: %q\nwant: %q", lhs, rhs, got, probe)
		}
	}
}

// TestEditScript checks slice.EditScript against a table of hand-written
// cases, and also verifies that applying each resulting script to the first
// input reproduces the second.
func TestEditScript(t *testing.T) {
	type testCase struct {
		a, b string
		want []slice.Edit
	}
	cases := []testCase{
		// Both inputs empty.
		{"", "", nil},

		// One input empty.
		{"a", "", pedit(t, "-1")},
		{"", "b", pedit(t, "+1:0")},

		{"a b c", "", pedit(t, "-3")},
		{"", "d e f", pedit(t, "+3:0")},

		// A single word against a longer input.
		{"a", "a b c", pedit(t, "=1 +2:1")},
		{"b", "a b c", pedit(t, "+1:0 =1 +1:2")},
		{"c", "a b c", pedit(t, "+2:0")},
		{"d", "a b c", pedit(t, "x1:0 +2:1")},

		// Inputs of similar size.
		{"a b c", "a b c", pedit(t, "")},
		{"a b c", "a x c", pedit(t, "=1 x1:1")},
		{"a b c", "a b", pedit(t, "=2 -1")},
		{"b c", "a b c", pedit(t, "+1:0")},

		{"a x b x c", "1 x b x 2", pedit(t, "x1:0 =3 x1:4")},
		{"fly you fools", "to fly you must not be fools", pedit(t, "+1:0 =2 +3:3")},

		// Longer sentences.
		{"have the best time it is possible to have under the circumstances",
			"I hope you have the time of your life in the forest",
			pedit(t, "+3:0 =2 -1 =1 x4:6 -2 =1 x1:11"),
		},
	}
	for _, c := range cases {
		lhs := strings.Fields(c.a)
		rhs := strings.Fields(c.b)
		script := slice.EditScript(lhs, rhs)
		if !slices.Equal(script, c.want) {
			t.Errorf("EditScript(%q, %q):\ngot: %v\nwant: %v", c.a, c.b, script, c.want)
		}
		checkApply(t, lhs, rhs, script)
	}
}

// checkApply runs the edit script against lhs and reports a test error if the
// output differs from rhs. On success it logs the inputs and the output.
func checkApply[T comparable, Slice ~[]T](t *testing.T, lhs, rhs Slice, edit []slice.Edit) {
	t.Helper()

	var result Slice
	pos := 0 // current offset into lhs
	for _, e := range edit {
		switch e.Op {
		case slice.OpDelete:
			// Skip over the deleted elements of lhs.
			pos += e.N
		case slice.OpCopy:
			result = append(result, lhs[pos:pos+e.N]...)
			pos += e.N
		case slice.OpInsert, slice.OpReplace:
			// Both operations take their output from rhs, but only a
			// replacement also consumes elements of lhs.
			result = append(result, rhs[e.X:e.X+e.N]...)
			if e.Op == slice.OpReplace {
				pos += e.N
			}
		default:
			t.Fatalf("Unexpected edit operation: %v", e)
		}
	}

	// Whatever remains of lhs after the last edit is copied through.
	result = append(result, lhs[pos:]...)
	if slices.Equal(result, rhs) {
		t.Logf("Apply L %v E %v OK: %v", lhs, edit, result)
		return
	}
	t.Errorf("Apply %v:\ngot: %v\nwant: %v", edit, result, rhs)
}

// pedit parses ss as a whitespace-separated list of edits, each written in
// the format produced by the String method of a slice.Edit: an opcode
// character, a count N, and optionally a colon followed by an offset X.
// Any malformed edit fails the test immediately.
func pedit(t *testing.T, ss string) (out []slice.Edit) {
	t.Helper()
	for _, word := range strings.Fields(ss) {
		op := word[0]
		if op != '-' && op != '=' && op != '+' && op != 'x' {
			t.Fatalf("Invalid edit op: %c", op)
		}

		// The text after the opcode is "N" or "N:X".
		count, offset, hasOffset := strings.Cut(word[1:], ":")
		n, err := strconv.Atoi(count)
		if err != nil {
			t.Fatalf("Invalid N: %v", err)
		}
		e := slice.Edit{Op: slice.EditOp(op), N: n}
		if hasOffset {
			x, err := strconv.Atoi(offset)
			if err != nil {
				t.Fatalf("Invalid X: %v", err)
			}
			e.X = x
		}
		out = append(out, e)
	}
	return out
}

0 comments on commit f189538

Please sign in to comment.