From fac16fee5348a9e31bde6afdf8a00218dcc63941 Mon Sep 17 00:00:00 2001 From: psadac Date: Mon, 23 Sep 2024 21:07:30 +0200 Subject: [PATCH] Add tests on long strings with few different characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmarks are run before and after optimization "Remove leading and trailing identical runes". Long strings with differences at the beginning (long_lead), in the middle (long_middle) or at the end (long_trail) show significant improvements in processing time and memory allocations. When the optimization is ineffective due to different leading and trailing characters (long_diff) there is no change in processing time or memory allocation. goos: linux goarch: amd64 pkg: github.com/agnivade/levenshtein cpu: AMD Ryzen 7 7840U w/ Radeon 780M Graphics │ before.txt │ after.txt │ │ sec/op │ sec/op vs base │ Simple/ASCII-16 134.20n ± 0% 79.03n ± 0% -41.11% (p=0.000 n=20) Simple/French-16 254.8n ± 0% 129.7n ± 0% -49.09% (p=0.000 n=20) Simple/Nordic-16 500.6n ± 1% 208.0n ± 0% -58.45% (p=0.000 n=20) Simple/Long_lead-16 1862.0n ± 0% 209.6n ± 1% -88.75% (p=0.000 n=20) Simple/Long_middle-16 3613.0n ± 0% 325.0n ± 0% -91.00% (p=0.000 n=20) Simple/Long_trail-16 3911.0n ± 0% 399.0n ± 1% -89.80% (p=0.000 n=20) Simple/Long_diff-16 4.030µ ± 0% 4.029µ ± 1% ~ (p=0.899 n=20) Simple/Tibetan-16 413.0n ± 0% 277.3n ± 0% -32.86% (p=0.000 n=20) geomean 964.6n 299.5n -68.95% │ before.txt │ after.txt │ │ B/op │ B/op vs base │ Simple/ASCII-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/French-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Nordic-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Long_lead-16 464.0 ± 0% 368.0 ± 0% -20.69% (p=0.000 n=20) Simple/Long_middle-16 672.0 ± 0% 544.0 ± 0% -19.05% (p=0.000 n=20) Simple/Long_trail-16 720.0 ± 0% 576.0 ± 0% -20.00% (p=0.000 n=20) Simple/Long_diff-16 720.0 ± 0% 720.0 ± 0% ~ (p=1.000 n=20) ¹ Simple/Tibetan-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ geomean ² -7.99% ² ¹ all samples are equal ² summaries must be >0 to compute geomean │ before.txt │ after.txt │ │ allocs/op │ allocs/op vs base │ Simple/ASCII-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/French-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Nordic-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Long_lead-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Long_middle-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Long_trail-16 3.000 ± 0% 2.000 ± 0% -33.33% (p=0.000 n=20) Simple/Long_diff-16 3.000 ± 0% 3.000 ± 0% ~ (p=1.000 n=20) ¹ Simple/Tibetan-16 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=20) ¹ geomean ² -14.11% ² ¹ all samples are equal ² summaries must be >0 to compute geomean --- levenshtein_test.go | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/levenshtein_test.go b/levenshtein_test.go index dd296d3..dd3607a 100644 --- a/levenshtein_test.go +++ b/levenshtein_test.go @@ -66,13 +66,35 @@ func BenchmarkSimple(b *testing.B) { name string }{ // ASCII - {"levenshtein", "frankenstein", "ASCII"}, + {a: "levenshtein", b: "frankenstein", name: "ASCII"}, // Testing acutes and umlauts - {"resumé and café", "resumés and cafés", "French"}, - {"Hafþór Júlíus Björnsson", "Hafþor Julius Bjornsson", "Nordic"}, - {"a very long string that is meant to exceed", "another very long string that is meant to exceed", "long string"}, + {a: "resumé and café", b: "resumés and cafés", name: "French"}, + {a: "Hafþór Júlíus Björnsson", b: "Hafþor Julius Bjornsson", name: "Nordic"}, + + // Long strings + { + a: "a very long string that is meant to exceed", + b: "another very long string that is meant to exceed", + name: "Long lead", + }, + { + a: "a very long string with a word in the middle that is different", + b: "a very long string with some text in the middle that is different", + name: "Long middle", + }, + { + a: "a very long string with some text at the end that is not the same", + b: "a very long string with some text at the end that is very different", + name: "Long trail", + }, + { + a: "+a very long string with different leading and trailing characters+", + b: "-a very long string with different leading and trailing characters-", + name: "Long diff", + }, + // Only 2 characters are less in the 2nd string - {"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", "Tibetan"}, + {a: "།་གམ་འས་པ་་མ།", b: "།་གམའས་པ་་མ", name: "Tibetan"}, } tmp := 0 for _, test := range tests {