Merge pull request #184 from bzz/maintenance/update-benchmark

Update benchmarks to latest Enry and Github-Linguist
src-d · Dec 28, 2018 · f28fc12 · f28fc12
2 parents ef50154 + 890afc4
commit f28fc12
Show file tree

Hide file tree

Showing 10 changed files with 28,117 additions and 25,770 deletions.
diff --git a/README.md b/README.md
@@ -217,21 +217,35 @@ Golang's regexp engine being slower than Ruby's, which uses the [oniguruma](http
 You can find scripts and additional information (like software and hardware used
 and benchmarks' results per sample file) in [*benchmarks*](https://github.com/src-d/enry/blob/master/benchmarks) directory.
 
-If you want to reproduce the same benchmarks you can run:
 
-    benchmarks/run.sh
+### Benchmark Dependencies
+As the benchmarks depend on Ruby and the Github-Linguist gem, make sure you have:
+ - Ruby (e.g. using [`rbenv`](https://github.com/rbenv/rbenv)) and [`bundler`](https://bundler.io/) installed
+ - Docker
+ - [native dependencies](https://github.com/github/linguist/#dependencies) installed
+ - Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
+ - Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
 
-from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).
 
-This can take some time, so to run local benchmarks for a quick check you can either:
+### How to reproduce current results
+
+If you want to reproduce the same benchmarks as reported above:
+ - Make sure all [dependencies](#benchmark-dependencies) are installed
+ - Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
+ - Run `ENRY_TEST_REPO=.linguist benchmarks/run.sh` (takes ~15h)
+
+It will run the benchmarks for enry and linguist, parse the output, create csv files and plot the histogram. This takes some time.
+
+### Quick benchmarks
+To run quicker benchmarks you can either:
 
     make benchmarks
 
 to get average times for the main detection function and strategies for the whole samples set or:
 
     make benchmarks-samples
 
-if you want to see measures by sample file.
+if you want to see measures per sample file.
 
 
 Why Enry?

diff --git a/benchmark_test.go b/benchmark_test.go
@@ -2,6 +2,7 @@ package enry
 
 import (
 	"flag"
+	"fmt"
 	"io/ioutil"
 	"log"
 	"os"
@@ -110,11 +111,9 @@ func getSamples(dir string) ([]*sample, error) {
 			filename: path,
 			content:  content,
 		}
-
 		samples = append(samples, s)
 		return nil
 	})
-
 	return samples, err
 }
 
@@ -157,17 +156,7 @@ func BenchmarkStrategiesTotal(b *testing.B) {
 		b.SkipNow()
 	}
 
-	benchmarks := []struct {
-		name       string
-		strategy   Strategy
-		candidates []string
-	}{
-		{name: "GetLanguagesByModeline()_TOTAL", strategy: GetLanguagesByModeline},
-		{name: "GetLanguagesByFilename()_TOTAL", strategy: GetLanguagesByFilename},
-		{name: "GetLanguagesByShebang()_TOTAL", strategy: GetLanguagesByShebang},
-		{name: "GetLanguagesByExtension()_TOTAL", strategy: GetLanguagesByExtension},
-		{name: "GetLanguagesByContent()_TOTAL", strategy: GetLanguagesByContent},
-	}
+	benchmarks := benchmarkForAllStrategies("TOTAL")
 
 	var o []string
 	for _, benchmark := range benchmarks {
@@ -222,17 +211,7 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
 		b.SkipNow()
 	}
 
-	benchmarks := []struct {
-		name       string
-		strategy   Strategy
-		candidates []string
-	}{
-		{name: "GetLanguagesByModeline()_SAMPLE_", strategy: GetLanguagesByModeline},
-		{name: "GetLanguagesByFilename()_SAMPLE_", strategy: GetLanguagesByFilename},
-		{name: "GetLanguagesByShebang()_SAMPLE_", strategy: GetLanguagesByShebang},
-		{name: "GetLanguagesByExtension()_SAMPLE_", strategy: GetLanguagesByExtension},
-		{name: "GetLanguagesByContent()_SAMPLE_", strategy: GetLanguagesByContent},
-	}
+	benchmarks := benchmarkForAllStrategies("SAMPLE")
 
 	var o []string
 	for _, benchmark := range benchmarks {
@@ -247,3 +226,19 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
 		}
 	}
 }
+
+type strategyName struct { // strategyName describes one benchmark case: a label plus the Strategy it refers to.
+	name       string   // label; benchmarkForAllStrategies builds it as "<function>()_<class>_"
+	strategy   Strategy // one of the GetLanguagesBy* strategy functions
+	candidates []string // not populated by benchmarkForAllStrategies, so it is nil in the entries it returns
+}
+
+func benchmarkForAllStrategies(class string) []strategyName { // returns one entry per detection strategy (five), each named "<function>()_<class>_"
+	return []strategyName{ // NOTE(review): every name now ends in "_"; previously only the SAMPLE names did — confirm result parsers accept "_TOTAL_"
+		{name: fmt.Sprintf("GetLanguagesByModeline()_%s_", class), strategy: GetLanguagesByModeline},
+		{name: fmt.Sprintf("GetLanguagesByFilename()_%s_", class), strategy: GetLanguagesByFilename},
+		{name: fmt.Sprintf("GetLanguagesByShebang()_%s_", class), strategy: GetLanguagesByShebang},
+		{name: fmt.Sprintf("GetLanguagesByExtension()_%s_", class), strategy: GetLanguagesByExtension},
+		{name: fmt.Sprintf("GetLanguagesByContent()_%s_", class), strategy: GetLanguagesByContent},
+	}
+}
diff --git a/benchmarks/csv/enry-distribution.csv b/benchmarks/csv/enry-distribution.csv
@@ -1,6 +1,6 @@
 timeInterval,enry,numberOfFiles
-1us-10us,enry,96
-10us-100us,enry,1244
-100us-1ms,enry,321
-1ms-10ms,enry,135
-10ms-100ms,enry,43
+1us-10us,enry,83
+10us-100us,enry,1341
+100us-1ms,enry,314
+1ms-10ms,enry,146
+10ms-100ms,enry,48