Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

go: reduce API surface #248

Merged
merged 3 commits into from
Oct 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ func BenchmarkClassifyTotal(b *testing.B) {
b.Run("Classify()_TOTAL", func(b *testing.B) {
for n := 0; n < b.N; n++ {
for _, sample := range samples {
o = DefaultClassifier.Classify(sample.content, nil)
o = defaultClassifier.classify(sample.content, nil)
}

overcomeLanguages = o
Expand Down Expand Up @@ -195,7 +195,7 @@ func BenchmarkClassifyPerSample(b *testing.B) {
for _, sample := range samples {
b.Run("Classify()_SAMPLE_"+sample.filename, func(b *testing.B) {
for n := 0; n < b.N; n++ {
o = DefaultClassifier.Classify(sample.content, nil)
o = defaultClassifier.classify(sample.content, nil)
}

overcomeLanguages = o
Expand Down
18 changes: 9 additions & 9 deletions classifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ import (
"github.com/src-d/enry/v2/internal/tokenizer"
)

// Classifier is the interface in charge to detect the possible languages of the given content based on a set of
// classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
Classify(content []byte, candidates map[string]float64) (languages []string)
type classifier interface {
classify(content []byte, candidates map[string]float64) (languages []string)
}

type classifier struct {
type naiveBayes struct {
languagesLogProbabilities map[string]float64
tokensLogProbabilities map[string]map[string]float64
tokensTotal float64
Expand All @@ -24,8 +24,8 @@ type scoredLanguage struct {
score float64
}

// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
// classify returns a slice of possible languages sorted by decreasing probability.
func (c *naiveBayes) classify(content []byte, candidates map[string]float64) []string {

var languages map[string]float64
if len(candidates) == 0 {
Expand Down Expand Up @@ -73,7 +73,7 @@ func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
return sortedLanguages
}

func (c *classifier) knownLangs() map[string]float64 {
func (c *naiveBayes) knownLangs() map[string]float64 {
langs := make(map[string]float64, len(c.languagesLogProbabilities))
for lang := range c.languagesLogProbabilities {
langs[lang]++
Expand All @@ -82,7 +82,7 @@ func (c *classifier) knownLangs() map[string]float64 {
return langs
}

func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
func (c *naiveBayes) tokensLogProbability(tokens []string, language string) float64 {
var sum float64
for _, token := range tokens {
sum += c.tokenProbability(token, language)
Expand All @@ -91,7 +91,7 @@ func (c *classifier) tokensLogProbability(tokens []string, language string) floa
return sum
}

func (c *classifier) tokenProbability(token, language string) float64 {
func (c *naiveBayes) tokenProbability(token, language string) float64 {
tokenProb, ok := c.tokensLogProbabilities[language][token]
if !ok {
tokenProb = math.Log(1.000000 / c.tokensTotal)
Expand Down
27 changes: 14 additions & 13 deletions common.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ var DefaultStrategies = []Strategy{
GetLanguagesByClassifier,
}

// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var DefaultClassifier Classifier = &classifier{
// defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var defaultClassifier classifier = &naiveBayes{
languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities,
tokensTotal: data.TokensTotal,
Expand Down Expand Up @@ -92,7 +92,7 @@ func GetLanguageByContent(filename string, content []byte) (language string, saf
}

// GetLanguageByClassifier returns the most probable language detected for the given content. It uses
// DefaultClassifier, if no candidates are provided it returns OtherLanguage.
// defaultClassifier; if no candidates are provided it returns OtherLanguage.
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
	// Delegate to the generic strategy runner with the classifier strategy.
	// The empty string fills the argument between the strategy and the
	// content (presumably the filename, which is not used here — confirm
	// against getLanguageByStrategy).
	language, safe = getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
	return language, safe
}
Expand All @@ -108,10 +108,10 @@ func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
return
}

// GetLanguageBySpecificClassifier returns the most probably language for the given content using
// getLanguageBySpecificClassifier returns the most probable language for the given content using
// classifier to detect language.
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
return getFirstLanguageAndSafe(languages)
}

Expand Down Expand Up @@ -413,27 +413,28 @@ func GetLanguagesByContent(filename string, content []byte, _ []string) []string
return heuristic.Match(content)
}

// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
// decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
// GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
// decreasing language's probability. If there are no candidates it returns nil.
// It is a Strategy that uses a pre-trained defaultClassifier.
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
if len(candidates) == 0 {
return nil
}

return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
}

// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
// getLanguagesBySpecificClassifier returns a slice of possible languages. It takes the classifier to be used.
func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
mapCandidates := make(map[string]float64)
for _, candidate := range candidates {
mapCandidates[candidate]++
}

return classifier.Classify(content, mapCandidates)
return classifier.classify(content, mapCandidates)
}

// GetLanguageExtensions returns the different extensions being used by the language.
// GetLanguageExtensions returns all extensions associated with the given language.
func GetLanguageExtensions(language string) []string {
	// Direct lookup in the extensions table; a language that is not a key
	// yields the map's zero value, i.e. a nil slice.
	extensions := data.ExtensionsByLanguage[language]
	return extensions
}
Expand Down
18 changes: 9 additions & 9 deletions common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,23 +332,23 @@ func (s *EnryTestSuite) TestGetLanguagesBySpecificClassifier() {
name string
filename string
candidates []string
classifier Classifier
classifier classifier
expected string
}{
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: nil, classifier: DefaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(s.samplesDir, "C/main.c"), candidates: []string{}, classifier: DefaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: DefaultClassifier, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: DefaultClassifier, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: DefaultClassifier, expected: "Python"},
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: DefaultClassifier, expected: "XML"},
{name: "TestGetLanguagesByClassifier_1", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_2", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: nil, classifier: defaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_3", filename: filepath.Join(s.samplesDir, "C/main.c"), candidates: []string{}, classifier: defaultClassifier, expected: "C"},
{name: "TestGetLanguagesByClassifier_4", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"python", "ruby", "c++"}, classifier: defaultClassifier, expected: "C++"},
{name: "TestGetLanguagesByClassifier_5", filename: filepath.Join(s.samplesDir, "C/blob.c"), candidates: []string{"ruby"}, classifier: defaultClassifier, expected: "Ruby"},
{name: "TestGetLanguagesByClassifier_6", filename: filepath.Join(s.samplesDir, "Python/django-models-base.py"), candidates: []string{"python", "ruby", "c", "c++"}, classifier: defaultClassifier, expected: "Python"},
{name: "TestGetLanguagesByClassifier_7", filename: os.DevNull, candidates: nil, classifier: defaultClassifier, expected: "XML"},
}

for _, test := range test {
content, err := ioutil.ReadFile(test.filename)
assert.NoError(s.T(), err)

languages := GetLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
languages := getLanguagesBySpecificClassifier(content, test.candidates, test.classifier)
var language string
if len(languages) == 0 {
language = OtherLanguage
Expand Down