From fac16fee5348a9e31bde6afdf8a00218dcc63941 Mon Sep 17 00:00:00 2001
From: psadac <paulsadac@gmail.com>
Date: Mon, 23 Sep 2024 21:07:30 +0200
Subject: [PATCH] Add tests on long strings with few different characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Benchmarks are run before and after optimization "Remove leading and trailing identical runes".
Long strings with differences at the beginning (long_lead), in the middle (long_middle) or at the end (long_trail) show significant improvements in processing time and memory allocations. When the optimization is ineffective due to different leading and trailing characters (long_diff) there is no change in processing time or memory allocation.

goos: linux
goarch: amd64
pkg: github.com/agnivade/levenshtein
cpu: AMD Ryzen 7 7840U w/ Radeon  780M Graphics
                      │  before.txt  │              after.txt              │
                      │    sec/op    │   sec/op     vs base                │
Simple/ASCII-16         134.20n ± 0%   79.03n ± 0%  -41.11% (p=0.000 n=20)
Simple/French-16         254.8n ± 0%   129.7n ± 0%  -49.09% (p=0.000 n=20)
Simple/Nordic-16         500.6n ± 1%   208.0n ± 0%  -58.45% (p=0.000 n=20)
Simple/Long_lead-16     1862.0n ± 0%   209.6n ± 1%  -88.75% (p=0.000 n=20)
Simple/Long_middle-16   3613.0n ± 0%   325.0n ± 0%  -91.00% (p=0.000 n=20)
Simple/Long_trail-16    3911.0n ± 0%   399.0n ± 1%  -89.80% (p=0.000 n=20)
Simple/Long_diff-16      4.030µ ± 0%   4.029µ ± 1%        ~ (p=0.899 n=20)
Simple/Tibetan-16        413.0n ± 0%   277.3n ± 0%  -32.86% (p=0.000 n=20)
geomean                  964.6n        299.5n       -68.95%

                      │  before.txt  │              after.txt               │
                      │     B/op     │    B/op     vs base                  │
Simple/ASCII-16         0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/French-16        0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/Nordic-16        0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/Long_lead-16     464.0 ± 0%     368.0 ± 0%  -20.69% (p=0.000 n=20)
Simple/Long_middle-16   672.0 ± 0%     544.0 ± 0%  -19.05% (p=0.000 n=20)
Simple/Long_trail-16    720.0 ± 0%     576.0 ± 0%  -20.00% (p=0.000 n=20)
Simple/Long_diff-16     720.0 ± 0%     720.0 ± 0%        ~ (p=1.000 n=20) ¹
Simple/Tibetan-16       0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
geomean                            ²                -7.99%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean

                      │  before.txt  │              after.txt               │
                      │  allocs/op   │ allocs/op   vs base                  │
Simple/ASCII-16         0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/French-16        0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/Nordic-16        0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/Long_lead-16     3.000 ± 0%     2.000 ± 0%  -33.33% (p=0.000 n=20)
Simple/Long_middle-16   3.000 ± 0%     2.000 ± 0%  -33.33% (p=0.000 n=20)
Simple/Long_trail-16    3.000 ± 0%     2.000 ± 0%  -33.33% (p=0.000 n=20)
Simple/Long_diff-16     3.000 ± 0%     3.000 ± 0%        ~ (p=1.000 n=20) ¹
Simple/Tibetan-16       0.000 ± 0%     0.000 ± 0%        ~ (p=1.000 n=20) ¹
geomean                            ²               -14.11%                ²
¹ all samples are equal
² summaries must be >0 to compute geomean
---
 levenshtein_test.go | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/levenshtein_test.go b/levenshtein_test.go
index dd296d3..dd3607a 100644
--- a/levenshtein_test.go
+++ b/levenshtein_test.go
@@ -66,13 +66,35 @@ func BenchmarkSimple(b *testing.B) {
 		name string
 	}{
 		// ASCII
-		{"levenshtein", "frankenstein", "ASCII"},
+		{a: "levenshtein", b: "frankenstein", name: "ASCII"},
 		// Testing acutes and umlauts
-		{"resumé and café", "resumés and cafés", "French"},
-		{"Hafþór Júlíus Björnsson", "Hafþor Julius Bjornsson", "Nordic"},
-		{"a very long string that is meant to exceed", "another very long string that is meant to exceed", "long string"},
+		{a: "resumé and café", b: "resumés and cafés", name: "French"},
+		{a: "Hafþór Júlíus Björnsson", b: "Hafþor Julius Bjornsson", name: "Nordic"},
+
+		// Long strings
+		{
+			a:    "a very long string that is meant to exceed",
+			b:    "another very long string that is meant to exceed",
+			name: "Long lead",
+		},
+		{
+			a:    "a very long string with a word in the middle that is different",
+			b:    "a very long string with some text in the middle that is different",
+			name: "Long middle",
+		},
+		{
+			a:    "a very long string with some text at the end that is not the same",
+			b:    "a very long string with some text at the end that is very different",
+			name: "Long trail",
+		},
+		{
+			a:    "+a very long string with different leading and trailing characters+",
+			b:    "-a very long string with different leading and trailing characters-",
+			name: "Long diff",
+		},
+
 		// Only 2 characters are less in the 2nd string
-		{"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", "Tibetan"},
+		{a: "།་གམ་འས་པ་་མ།", b: "།་གམའས་པ་་མ", name: "Tibetan"},
 	}
 	tmp := 0
 	for _, test := range tests {