Merge pull request #18 from chrisport/develop
v0.2
chrisport authored Sep 6, 2017
2 parents 090d669 + f02eda4 commit 19cc075
Showing 22 changed files with 1,116 additions and 178 deletions.
12 changes: 12 additions & 0 deletions Godeps/Godeps.json

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions Godeps/Readme

Some generated files are not rendered by default.

45 changes: 39 additions & 6 deletions README.md
@@ -1,9 +1,12 @@
[![wercker status](https://app.wercker.com/status/9e2a695f35c1cf5e1cac46035e4ca7a6/m/ "wercker status")](https://app.wercker.com/project/byKey/9e2a695f35c1cf5e1cac46035e4ca7a6)
[![Coverage Status](https://img.shields.io/coveralls/chrisport/go-lang-detector.svg)](https://coveralls.io/r/chrisport/go-lang-detector?branch=master)
# Language Detector

This golang library provides functionality to analyze and recognize language based on text.
Breaking changes in v0.2: please see the "Migration" section below.
The previous version is available as Release v0.1: https://github.com/chrisport/go-lang-detector/releases/tag/v0.1

# Language Detector

This golang library provides functionality to analyze and recognize language based on text.

The implementation is based on the following paper:
N-Gram-Based Text Categorization
@@ -15,6 +18,13 @@ Ann Arbor MI 48113-4001
A language profile is a ```map[string]int``` that maps each n-gram token to its occurrence rank. So for the most
frequent token 'X' of the analyzed text, map['X'] will be 1.
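
For illustration, here is a minimal, self-contained sketch of how such a rank profile can be derived from raw occurrence counts. It mirrors what CreateOccurenceMap/CreateRankLookupMap in langdet/analyzing.go do, but it is not the library's exact code:
``` go
package main

import (
	"fmt"
	"sort"
)

func main() {
	// Hypothetical occurrence counts for a handful of bigrams.
	occurrences := map[string]int{"th": 42, "he": 40, "an": 17}

	// Order tokens by descending occurrence count.
	tokens := make([]string, 0, len(occurrences))
	for t := range occurrences {
		tokens = append(tokens, t)
	}
	sort.Slice(tokens, func(i, j int) bool {
		return occurrences[tokens[i]] > occurrences[tokens[j]]
	})

	// The most frequent token gets rank 1, the next one rank 2, and so on.
	profile := map[string]int{}
	for rank, t := range tokens {
		profile[t] = rank + 1
	}
	fmt.Println(profile["th"]) // 1
}
```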

### Detection by unicode range
A second way to detect the language is by the unicode range used in the text.
Golang has a set of predefined unicode ranges in package unicode, which can be used
directly, for example to detect Chinese/Japanese/Korean:
``` go
var CHINESE_JAPANESE_KOREAN = &langdet.UnicodeRangeLanguageComparator{"CJK", unicode.Han}
```
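
As a sketch, such a comparator could be plugged into a detector using the methods shown further below (AddLanguageComparators and GetClosestLanguage); the exact result depends on the configured languages:
``` go
detector := langdet.NewDetector()
detector.AddLanguageComparators(CHINESE_JAPANESE_KOREAN)
fmt.Println(detector.GetClosestLanguage("你好，世界")) // expected to print "CJK" for this input
```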
## Usage
### Detect
#### Get the closest language:
@@ -51,10 +61,6 @@ GetLanguage, which will return you all analyzed languages and their percentage o
```
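
A minimal sketch of both calls, assuming a detector initialized with the default languages (see the Migration section below):
``` go
detector := langdetdef.NewWithDefaultLanguages()

// Single best guess.
fmt.Println(detector.GetClosestLanguage("ceci est un exemple"))

// All analyzed languages with their confidence in percent.
for _, r := range detector.GetLanguages("ceci est un exemple") {
	fmt.Println(r.Name, r.Confidence, "%")
}
```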

#### Use default languages
In order to use default languages, the file default_languages.json must be placed in the same directory as the binary.
Alternatively it can be anywhere on the filesystem and initialized by calling InitWithDefault with the filepath.

### Analyze new language

For analysing a new language, random Wikipedia articles in the target language are ideal. The result will be a Language object, containing the specified name and the profile
@@ -88,6 +94,33 @@ Alternatively Analyze can be used and the resulting language can added using Ad
detectorC.AddLanguage(french)
```
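
A sketch of that alternative path, using the Analyze and AnalyzeWithNDepth signatures from langdet/analyzing.go in this commit (frenchSampleText is a placeholder for text you have already loaded, e.g. from a Wikipedia article):
``` go
// Build the profile with the default n-gram depth ...
french := langdet.Analyze(frenchSampleText, "french")
// ... or pick the depth yourself (AnalyzeWithNDepth is new in this release), e.g. 4:
// french := langdet.AnalyzeWithNDepth(frenchSampleText, "french", 4)
detectorC.AddLanguage(french)
```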

## Migration to v0.2

This library has been adapted to a more convenient and more idiomatic API.
- Default languages are provided in Go code, so there is no need to ship the json file anymore.
- All code related to defaults has been moved to the package langdetdef.
- Default languages can be added using the provided interfaces:
``` go
// detector with default languages
detector := langdetdef.NewWithDefaultLanguages()

// add all to existing detector
defaults := langdetdef.DefaultLanguages()
detector.AddLanguageComparators(defaults...)

// add selectively
detector.AddLanguageComparators(langdetdef.CHINESE_JAPANESE_KOREAN, langdetdef.ENGLISH)
```
- InitWithDefaultFromXY has been removed; custom default languages can be unmarshaled manually and added to a detector through the
AddLanguage interface:
``` go
detector := langdet.NewDetector()
customLanguages := []langdet.Language{}
_ = json.Unmarshal(bytesFromFile, &customLanguages)
detector.AddLanguage(customLanguages...)
```

## Contribution

Suggestions and bug reports can be made through GitHub issues.
Empty file modified default_languages.json
100755 → 100644
Empty file.
49 changes: 8 additions & 41 deletions example.go
@@ -2,58 +2,25 @@ package main

import (
"fmt"
"github.com/chrisport/go-lang-detector/langdet"
"io/ioutil"
"os"
"github.com/chrisport/go-lang-detector/langdet/langdetdef"
)

/*
analyzed.json includes:
Arabic, English, French, German, Hebrew, Russian, Turkish
*/
func main() {
detector := langdetdef.NewWithDefaultLanguages()

//sample using Reader to Initialize default languages
// analyzedInput, _ := ioutil.ReadFile("default_languages2.json")
// s := string(analyzedInput[:1652088])
// langdet.InitWithDefaultFromReader(strings.NewReader(s))
// detector := langdet.NewDefaultLanguages()

//sample by manually analyzing languages
// Analyze different languages from files and write to analyzed.json:
detector := langdet.Detector{}
detector.AddLanguageFromText(GetTextFromFile("samples/english.txt"), "english")
detector.AddLanguageFromText(GetTextFromFile("samples/german.txt"), "german")
detector.AddLanguageFromText(GetTextFromFile("samples/french.txt"), "french")
detector.AddLanguageFromText(GetTextFromFile("samples/turkish.txt"), "turkish")
detector.AddLanguageFromText(GetTextFromFile("samples/arabic"), "arabic")
detector.AddLanguageFromText(GetTextFromFile("samples/hebrew"), "hebrew")
detector.AddLanguageFromText(GetTextFromFile("samples/russian"), "russian")
testString := GetTextFromFile("example_input.txt")
result := detector.GetClosestLanguage(testString)
result := detector.GetClosestLanguage("ont ne comprend rien")
fmt.Println("GetClosestLanguage returns:\n", " ", result)

fullResults := detector.GetLanguages(testString)
fullResults := detector.GetLanguages("ont ne comprend rien")
fmt.Println("GetLanguages returns:")
for _, r := range fullResults {
fmt.Println(" ", r.Name, r.Confidence, "%")
}

}

// GetTextFromFile returns the content of file (identified by given fileName) as text
func GetTextFromFile(fileName string) string {
text, err := ioutil.ReadFile(fileName)
if err != nil {
panic(err)
fullResults = detector.GetLanguages("义勇军进行曲")
fmt.Println("GetLanguages for Chinese returns:")
for _, r := range fullResults {
fmt.Println(" ", r.Name, r.Confidence, "%")
}
return string(text)
}

// WriteToFile writes a content into a file with specified name
func WriteToFile(content []byte, fileName string) {
err := ioutil.WriteFile(fileName, content, os.ModePerm)
if err != nil {
panic(err)
}
}
1 change: 0 additions & 1 deletion example_input.txt

This file was deleted.

8 changes: 6 additions & 2 deletions langdet/analyzing.go
@@ -12,8 +12,12 @@ import (
// -1 for no maximum
var maxSampleSize = 10000

// Analyze creates the language profile from a given Text and returns it in a Language struct.
func Analyze(text, name string) Language {
return AnalyzeWithNDepth(text, name, DEFAULT_NDEPTH)
}

// AnalyzeWithNDepth creates the language profile from a given Text with the given n-gram depth and returns it in a Language struct.
func AnalyzeWithNDepth(text, name string, nDepth int) Language {
theMap := CreateOccurenceMap(text, nDepth)
ranked := CreateRankLookupMap(theMap)
return Language{Name: name, Profile: ranked}
@@ -69,7 +73,7 @@ func generateNthGrams(resultMap map[string]int, text string, n int) {
text = padding + text + padding
upperBound := utf8.RuneCountInString(text) - (n - 1)
for p := 0; p < upperBound; p++ {
currentToken := text[p : p+n]
currentToken := text[p: p+n]
resultMap[currentToken] += 1
}
}
2 changes: 1 addition & 1 deletion langdet/analyzing_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package langdet_test

import (
. "github.com/smartystreets/goconvey/convey"
"github.com/chrisport/go-lang-detector/langdet"
. "github.com/smartystreets/goconvey/convey"
"testing"
)

