Skip to content

Commit

Permalink
Merge pull request #184 from bzz/maintenance/update-benchmark
Browse files Browse the repository at this point in the history
Update benchmarks to latest Enry and Github-Linguist
  • Loading branch information
bzz authored Dec 28, 2018
2 parents ef50154 + 890afc4 commit f28fc12
Show file tree
Hide file tree
Showing 10 changed files with 28,117 additions and 25,770 deletions.
24 changes: 19 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,21 +217,35 @@ Golang's regexp engine being slower than Ruby's, which uses the [oniguruma](http
You can find scripts and additional information (like software and hardware used
and benchmarks' results per sample file) in [*benchmarks*](https://github.com/src-d/enry/blob/master/benchmarks) directory.

If you want to reproduce the same benchmarks you can run:

benchmarks/run.sh
### Benchmark Dependencies
As the benchmarks depend on Ruby and the Github-Linguist gem, make sure you have:
- Ruby (e.g. using [`rbenv`](https://github.com/rbenv/rbenv)) and [`bundler`](https://bundler.io/) installed
- Docker
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`

from the project's root directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).

This can take some time, so to run local benchmarks for a quick check you can either:
### How to reproduce current results

If you want to reproduce the same benchmarks as reported above:
- Make sure all [dependencies](#benchmark-dependencies) are installed
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
- Run `ENRY_TEST_REPO=.linguist benchmarks/run.sh` (takes ~15h)

It will run the benchmarks for enry and linguist, parse the output, create csv files and plot the histogram. This takes some time.

### Quick
To run quicker benchmarks you can either:

make benchmarks

to get average times for the main detection function and strategies for the whole samples set or:

make benchmarks-samples

if you want to see measures by sample file.
if you want to see measures per sample file.


Why Enry?
Expand Down
43 changes: 19 additions & 24 deletions benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package enry

import (
"flag"
"fmt"
"io/ioutil"
"log"
"os"
Expand Down Expand Up @@ -110,11 +111,9 @@ func getSamples(dir string) ([]*sample, error) {
filename: path,
content: content,
}

samples = append(samples, s)
return nil
})

return samples, err
}

Expand Down Expand Up @@ -157,17 +156,7 @@ func BenchmarkStrategiesTotal(b *testing.B) {
b.SkipNow()
}

benchmarks := []struct {
name string
strategy Strategy
candidates []string
}{
{name: "GetLanguagesByModeline()_TOTAL", strategy: GetLanguagesByModeline},
{name: "GetLanguagesByFilename()_TOTAL", strategy: GetLanguagesByFilename},
{name: "GetLanguagesByShebang()_TOTAL", strategy: GetLanguagesByShebang},
{name: "GetLanguagesByExtension()_TOTAL", strategy: GetLanguagesByExtension},
{name: "GetLanguagesByContent()_TOTAL", strategy: GetLanguagesByContent},
}
benchmarks := benchmarkForAllStrategies("TOTAL")

var o []string
for _, benchmark := range benchmarks {
Expand Down Expand Up @@ -222,17 +211,7 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
b.SkipNow()
}

benchmarks := []struct {
name string
strategy Strategy
candidates []string
}{
{name: "GetLanguagesByModeline()_SAMPLE_", strategy: GetLanguagesByModeline},
{name: "GetLanguagesByFilename()_SAMPLE_", strategy: GetLanguagesByFilename},
{name: "GetLanguagesByShebang()_SAMPLE_", strategy: GetLanguagesByShebang},
{name: "GetLanguagesByExtension()_SAMPLE_", strategy: GetLanguagesByExtension},
{name: "GetLanguagesByContent()_SAMPLE_", strategy: GetLanguagesByContent},
}
benchmarks := benchmarkForAllStrategies("SAMPLE")

var o []string
for _, benchmark := range benchmarks {
Expand All @@ -247,3 +226,19 @@ func BenchmarkStrategiesPerSample(b *testing.B) {
}
}
}

// strategyName pairs a Strategy with the label used to identify it in the
// strategy benchmarks; benchmarkForAllStrategies builds the list of them.
type strategyName struct {
	// name is the benchmark label, built by benchmarkForAllStrategies as
	// "<Function>()_<class>_".
	name string
	// strategy is the Strategy value the benchmark entry refers to.
	strategy Strategy
	// candidates is not populated by benchmarkForAllStrategies (it stays nil).
	// NOTE(review): presumably the candidate languages handed to strategy —
	// confirm against the benchmark bodies that consume this struct.
	candidates []string
}

// benchmarkForAllStrategies returns one benchmark entry per strategy, in the
// fixed order Modeline, Filename, Shebang, Extension, Content.
//
// Each entry is named "<Function>()_<class>_", so class (for example "TOTAL"
// or "SAMPLE") distinguishes benchmark families that share the same
// strategies. The candidates field of every entry is left nil.
func benchmarkForAllStrategies(class string) []strategyName {
	// label renders the benchmark name for the strategy function called fn.
	label := func(fn string) string {
		return fmt.Sprintf("%s()_%s_", fn, class)
	}

	entries := make([]strategyName, 0, 5)
	entries = append(entries,
		strategyName{name: label("GetLanguagesByModeline"), strategy: GetLanguagesByModeline},
		strategyName{name: label("GetLanguagesByFilename"), strategy: GetLanguagesByFilename},
		strategyName{name: label("GetLanguagesByShebang"), strategy: GetLanguagesByShebang},
		strategyName{name: label("GetLanguagesByExtension"), strategy: GetLanguagesByExtension},
		strategyName{name: label("GetLanguagesByContent"), strategy: GetLanguagesByContent},
	)
	return entries
}
10 changes: 5 additions & 5 deletions benchmarks/csv/enry-distribution.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
timeInterval,enry,numberOfFiles
1us-10us,enry,96
10us-100us,enry,1244
100us-1ms,enry,321
1ms-10ms,enry,135
10ms-100ms,enry,43
1us-10us,enry,83
10us-100us,enry,1341
100us-1ms,enry,314
1ms-10ms,enry,146
10ms-100ms,enry,48
Loading

0 comments on commit f28fc12

Please sign in to comment.