Merge pull request #18 from chrisport/develop
v0.2
chrisport authored Sep 6, 2017
2 parents 090d669 + f02eda4 commit 19cc075
Showing 22 changed files with 1,116 additions and 178 deletions.
12 changes: 12 additions & 0 deletions Godeps/Godeps.json

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions Godeps/Readme

Some generated files are not rendered by default.

45 changes: 39 additions & 6 deletions README.md
@@ -1,9 +1,12 @@
[![wercker status](https://app.wercker.com/status/9e2a695f35c1cf5e1cac46035e4ca7a6/m/ "wercker status")](https://app.wercker.com/project/byKey/9e2a695f35c1cf5e1cac46035e4ca7a6)
[![Coverage Status](https://img.shields.io/coveralls/chrisport/go-lang-detector.svg)](https://coveralls.io/r/chrisport/go-lang-detector?branch=master)
# Language Detector

This golang library provides functionality to analyze and recognize language based on text.
Breaking changes in v0.2: please see the "Migration" section below.
The previous version is available as Release v0.1: https://github.com/chrisport/go-lang-detector/releases/tag/v0.1

# Language Detector

This golang library provides functionality to analyze and recognize language based on text.

The implementation is based on the following paper:
N-Gram-Based Text Categorization
@@ -15,6 +18,13 @@ Ann Arbor MI 48113-4001
A language profile is a ```map[string]int``` that maps each n-gram token to its occurrence rank. So for the most
frequent token 'X' of the analyzed text, map['X'] will be 1.
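
For illustration, here is a minimal, self-contained sketch of how such a rank profile can be derived from raw occurrence counts. It mirrors what CreateOccurenceMap/CreateRankLookupMap in langdet/analyzing.go do, but it is not the library's exact code:
``` go
package main

import (
	"fmt"
	"sort"
)

func main() {
	// Hypothetical occurrence counts for a handful of bigrams.
	occurrences := map[string]int{"th": 42, "he": 40, "an": 17}

	// Order tokens by descending occurrence count.
	tokens := make([]string, 0, len(occurrences))
	for t := range occurrences {
		tokens = append(tokens, t)
	}
	sort.Slice(tokens, func(i, j int) bool {
		return occurrences[tokens[i]] > occurrences[tokens[j]]
	})

	// The most frequent token gets rank 1, the next one rank 2, and so on.
	profile := map[string]int{}
	for rank, t := range tokens {
		profile[t] = rank + 1
	}
	fmt.Println(profile["th"]) // 1
}
```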

### Detection by unicode range
A second way to detect the language is by the unicode range used in the text.
Golang has a set of predefined unicode ranges in package unicode, which can be used
directly, for example to detect Chinese/Japanese/Korean:
``` go
var CHINESE_JAPANESE_KOREAN = &langdet.UnicodeRangeLanguageComparator{"CJK", unicode.Han}
```
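
As a sketch, such a comparator could be plugged into a detector using the methods shown further below (AddLanguageComparators and GetClosestLanguage); the exact result depends on the configured languages:
``` go
detector := langdet.NewDetector()
detector.AddLanguageComparators(CHINESE_JAPANESE_KOREAN)
fmt.Println(detector.GetClosestLanguage("你好，世界")) // expected to print "CJK" for this input
```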
## Usage
### Detect
#### Get the closest language:
@@ -51,10 +61,6 @@ GetLanguage, which will return you all analyzed languages and their percentage o
```
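
A minimal sketch of both calls, assuming a detector initialized with the default languages (see the Migration section below):
``` go
detector := langdetdef.NewWithDefaultLanguages()

// Single best guess.
fmt.Println(detector.GetClosestLanguage("ceci est un exemple"))

// All analyzed languages with their confidence in percent.
for _, r := range detector.GetLanguages("ceci est un exemple") {
	fmt.Println(r.Name, r.Confidence, "%")
}
```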

#### Use default languages
In order to use default languages, the file default_languages.json must be placed in the same directory as the binary.
Alternatively it can be anywhere on the filesystem and initialized by calling InitWithDefault with the filepath.

### Analyze new language

For analysing a new language, random Wikipedia articles in the target language are ideal. The result will be a Language object, containing the specified name and the profile
@@ -88,6 +94,33 @@ Alternatively Analyze can be used and the resulting language can added using Ad
detectorC.AddLanguage(french)
```
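
A sketch of that alternative path, using the Analyze and AnalyzeWithNDepth signatures from langdet/analyzing.go in this commit (frenchSampleText is a placeholder for text you have already loaded, e.g. from a Wikipedia article):
``` go
// Build the profile with the default n-gram depth ...
french := langdet.Analyze(frenchSampleText, "french")
// ... or pick the depth yourself (AnalyzeWithNDepth is new in this release), e.g. 4:
// french := langdet.AnalyzeWithNDepth(frenchSampleText, "french", 4)
detectorC.AddLanguage(french)
```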

## Migration to v0.2

This library has been adapted to a more convenient and more idiomatic API.
- Default languages are provided in Go code, so there is no need to ship the json file anymore.
- All code related to defaults has been moved to the package langdetdef.
- Default languages can be added using the provided interfaces:
``` go
// detector with default languages
detector := langdetdef.NewWithDefaultLanguages()

// add all to existing detector
defaults := langdetdef.DefaultLanguages()
detector.AddLanguageComparators(defaults...)

// add selectively
detector.AddLanguageComparators(langdetdef.CHINESE_JAPANESE_KOREAN, langdetdef.ENGLISH)
```
- InitWithDefaultFromXY has been removed; custom default languages can be unmarshaled manually and added to a detector through the
AddLanguage interface:
``` go
detector := langdet.NewDetector()
customLanguages := []langdet.Language{}
_ = json.Unmarshal(bytesFromFile, &customLanguages)
detector.AddLanguage(customLanguages...)
```

## Contribution

Suggestions and bug reports can be made through GitHub issues.
Empty file modified default_languages.json
100755 → 100644
Empty file.
49 changes: 8 additions & 41 deletions example.go
@@ -2,58 +2,25 @@ package main

import (
"fmt"
"github.com/chrisport/go-lang-detector/langdet"
"io/ioutil"
"os"
"github.com/chrisport/go-lang-detector/langdet/langdetdef"
)

/*
analyzed.json includes:
Arabic, English, French, German, Hebrew, Russian, Turkish
*/
func main() {
detector := langdetdef.NewWithDefaultLanguages()

//sample using Reader to Initialize default languages
// analyzedInput, _ := ioutil.ReadFile("default_languages2.json")
// s := string(analyzedInput[:1652088])
// langdet.InitWithDefaultFromReader(strings.NewReader(s))
// detector := langdet.NewDefaultLanguages()

//sample by manually analyzing languages
// Analyze different languages from files and write to analyzed.json:
detector := langdet.Detector{}
detector.AddLanguageFromText(GetTextFromFile("samples/english.txt"), "english")
detector.AddLanguageFromText(GetTextFromFile("samples/german.txt"), "german")
detector.AddLanguageFromText(GetTextFromFile("samples/french.txt"), "french")
detector.AddLanguageFromText(GetTextFromFile("samples/turkish.txt"), "turkish")
detector.AddLanguageFromText(GetTextFromFile("samples/arabic"), "arabic")
detector.AddLanguageFromText(GetTextFromFile("samples/hebrew"), "hebrew")
detector.AddLanguageFromText(GetTextFromFile("samples/russian"), "russian")
testString := GetTextFromFile("example_input.txt")
result := detector.GetClosestLanguage(testString)
result := detector.GetClosestLanguage("ont ne comprend rien")
fmt.Println("GetClosestLanguage returns:\n", " ", result)

fullResults := detector.GetLanguages(testString)
fullResults := detector.GetLanguages("ont ne comprend rien")
fmt.Println("GetLanguages returns:")
for _, r := range fullResults {
fmt.Println(" ", r.Name, r.Confidence, "%")
}

}

// GetTextFromFile returns the content of file (identified by given fileName) as text
func GetTextFromFile(fileName string) string {
text, err := ioutil.ReadFile(fileName)
if err != nil {
panic(err)
fullResults = detector.GetLanguages("义勇军进行曲")
fmt.Println("GetLanguages for Chinese returns:")
for _, r := range fullResults {
fmt.Println(" ", r.Name, r.Confidence, "%")
}
return string(text)
}

// WriteToFile writes a content into a file with specified name
func WriteToFile(content []byte, fileName string) {
err := ioutil.WriteFile(fileName, content, os.ModePerm)
if err != nil {
panic(err)
}
}
1 change: 0 additions & 1 deletion example_input.txt

This file was deleted.

8 changes: 6 additions & 2 deletions langdet/analyzing.go
@@ -12,8 +12,12 @@ import (
// -1 for no maximum
var maxSampleSize = 10000

// Analyze creates the language profile from a given Text and returns it in a Language struct.
func Analyze(text, name string) Language {
return AnalyzeWithNDepth(text, name, DEFAULT_NDEPTH)
}

// AnalyzeWithNDepth creates the language profile from a given Text with the given n-gram depth and returns it in a Language struct.
func AnalyzeWithNDepth(text, name string, nDepth int) Language {
theMap := CreateOccurenceMap(text, nDepth)
ranked := CreateRankLookupMap(theMap)
return Language{Name: name, Profile: ranked}
@@ -69,7 +73,7 @@ func generateNthGrams(resultMap map[string]int, text string, n int) {
text = padding + text + padding
upperBound := utf8.RuneCountInString(text) - (n - 1)
for p := 0; p < upperBound; p++ {
currentToken := text[p : p+n]
currentToken := text[p: p+n]
resultMap[currentToken] += 1
}
}
2 changes: 1 addition & 1 deletion langdet/analyzing_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package langdet_test

import (
. "github.com/smartystreets/goconvey/convey"
"github.com/chrisport/go-lang-detector/langdet"
. "github.com/smartystreets/goconvey/convey"
"testing"
)

