From 7060e87f92e8ee4dbb701ecc1c1907c71ed4e255 Mon Sep 17 00:00:00 2001
From: Cotch22 <gglt666@126.com>
Date: Thu, 20 Jul 2023 11:26:03 +0800
Subject: [PATCH 1/3] Add Vietnamese stop words

---
 custom.go       |  61 ++++----
 simhash.go      |   2 +
 stopwords.go    |   6 +-
 stopwords_vi.go | 379 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 418 insertions(+), 30 deletions(-)
 create mode 100644 stopwords_vi.go

diff --git a/custom.go b/custom.go
index 8139714..425e64d 100644
--- a/custom.go
+++ b/custom.go
@@ -19,7 +19,7 @@ import (
 func LoadStopWordsFromFile(filePath string, langCode string, sep string) {
 	b, err := ioutil.ReadFile(filePath)
 	if err != nil {
-			panic(err)
+		panic(err)
 	}
 	LoadStopWordsFromString(string(b), langCode, sep)
 }
@@ -41,137 +41,142 @@ func LoadStopWordsFromString(wordsList string, langCode string, sep string) {
 	case "ar":
 		arabic = make(map[string]string)
 		for _, word := range words {
-				arabic[word] = ""
+			arabic[word] = ""
 		}
 	case "bg":
 		bulgarian = make(map[string]string)
 		for _, word := range words {
-				bulgarian[word] = ""
+			bulgarian[word] = ""
 		}
 	case "cs":
 		czech = make(map[string]string)
 		for _, word := range words {
-				czech[word] = ""
+			czech[word] = ""
 		}
 	case "da":
 		danish = make(map[string]string)
 		for _, word := range words {
-				danish[word] = ""
+			danish[word] = ""
 		}
 	case "de":
 		german = make(map[string]string)
 		for _, word := range words {
-				german[word] = ""
+			german[word] = ""
 		}
 	case "el":
 		greek = make(map[string]string)
 		for _, word := range words {
-				greek[word] = ""
+			greek[word] = ""
 		}
 	case "en":
 		english = make(map[string]string)
 		for _, word := range words {
-				english[word] = ""
+			english[word] = ""
 		}
 	case "es":
 		spanish = make(map[string]string)
 		for _, word := range words {
-				spanish[word] = ""
+			spanish[word] = ""
 		}
 	case "fa":
 		persian = make(map[string]string)
 		for _, word := range words {
-				persian[word] = ""
+			persian[word] = ""
 		}
 	case "fr":
 		french = make(map[string]string)
 		for _, word := range words {
-				french[word] = ""
+			french[word] = ""
 		}
 	case "fi":
 		finnish = make(map[string]string)
 		for _, word := range words {
-				finnish[word] = ""
+			finnish[word] = ""
 		}
 	case "hu":
 		hungarian = make(map[string]string)
 		for _, word := range words {
-				hungarian[word] = ""
+			hungarian[word] = ""
 		}
 	case "id":
 		indonesian = make(map[string]string)
 		for _, word := range words {
-				indonesian[word] = ""
+			indonesian[word] = ""
 		}
 	case "it":
 		italian = make(map[string]string)
 		for _, word := range words {
-				italian[word] = ""
+			italian[word] = ""
 		}
 	case "ja":
 		japanese = make(map[string]string)
 		for _, word := range words {
-				japanese[word] = ""
+			japanese[word] = ""
 		}
 	case "km":
 		khmer = make(map[string]string)
 		for _, word := range words {
-				khmer[word] = ""
+			khmer[word] = ""
 		}
 	case "lv":
 		latvian = make(map[string]string)
 		for _, word := range words {
-				latvian[word] = ""
+			latvian[word] = ""
 		}
 	case "nl":
 		dutch = make(map[string]string)
 		for _, word := range words {
-				dutch[word] = ""
+			dutch[word] = ""
 		}
 	case "no":
 		norwegian = make(map[string]string)
 		for _, word := range words {
-				norwegian[word] = ""
+			norwegian[word] = ""
 		}
 	case "pl":
 		polish = make(map[string]string)
 		for _, word := range words {
-				polish[word] = ""
+			polish[word] = ""
 		}
 	case "pt":
 		portuguese = make(map[string]string)
 		for _, word := range words {
-				portuguese[word] = ""
+			portuguese[word] = ""
 		}
 	case "ro":
 		romanian = make(map[string]string)
 		for _, word := range words {
-				romanian[word] = ""
+			romanian[word] = ""
 		}
 	case "ru":
 		russian = make(map[string]string)
 		for _, word := range words {
-				russian[word] = ""
+			russian[word] = ""
 		}
 	case "sk":
 		slovak = make(map[string]string)
 		for _, word := range words {
-				slovak[word] = ""
+			slovak[word] = ""
 		}
 	case "sv":
 		swedish = make(map[string]string)
 		for _, word := range words {
-				swedish[word] = ""
+			swedish[word] = ""
 		}
 	case "th":
 		thai = make(map[string]string)
 		for _, word := range words {
-				thai[word] = ""
+			thai[word] = ""
 		}
 	case "tr":
 		turkish = make(map[string]string)
 		for _, word := range words {
-				turkish[word] = ""
+			turkish[word] = ""
+		}
+	case "vi":
+		vietnamese = make(map[string]string)
+		for _, word := range words {
+			vietnamese[word] = ""
 		}
 	}
 }
diff --git a/simhash.go b/simhash.go
index 09c63c9..3a68430 100644
--- a/simhash.go
+++ b/simhash.go
@@ -95,6 +95,8 @@ func Simhash(content []byte, langCode string, cleanHTML bool) uint64 {
 		hash = removeStopWordsAndHash(content, thai)
 	case "tr":
 		hash = removeStopWordsAndHash(content, turkish)
+	case "vi":
+		hash = removeStopWordsAndHash(content, vietnamese)
 	}
 
 	return hash
diff --git a/stopwords.go b/stopwords.go
index c4b52d0..985af71 100644
--- a/stopwords.go
+++ b/stopwords.go
@@ -23,8 +23,8 @@ import (
 )
 
 var (
-	remTags      = regexp.MustCompile(`<[^>]*>`)
-	oneSpace     = regexp.MustCompile(`\s{2,}`)
+	remTags       = regexp.MustCompile(`<[^>]*>`)
+	oneSpace      = regexp.MustCompile(`\s{2,}`)
 	wordSegmenter = regexp.MustCompile(`[\pL\p{Mc}\p{Mn}-_']+`)
 )
 
@@ -118,6 +118,8 @@ func Clean(content []byte, langCode string, cleanHTML bool) []byte {
 		content = removeStopWords(content, thai)
 	case "tr":
 		content = removeStopWords(content, turkish)
+	case "vi":
+		content = removeStopWords(content, vietnamese)
 	}
 
 	//Remove duplicated space characters
diff --git a/stopwords_vi.go b/stopwords_vi.go
new file mode 100644
index 0000000..d2e6752
--- /dev/null
+++ b/stopwords_vi.go
@@ -0,0 +1,379 @@
+// Copyright (c) 2023, Cotch22.
+// Use of this source code is governed by the BSD
+// license that can be found in the LICENSE file.
+
+package stopwords
+
+var vietnamese = map[string]string{
+	"ai":     "",
+	"alô":    "",
+	"amen":   "",
+	"anh":    "",
+	"ba":     "",
+	"biết":   "",
+	"buổi":   "",
+	"bà":     "",
+	"bài":    "",
+	"bác":    "",
+	"bán":    "",
+	"bèn":    "",
+	"béng":   "",
+	"bên":    "",
+	"bông":   "",
+	"bước":   "",
+	"bạn":    "",
+	"bản":    "",
+	"bấy":    "",
+	"bằng":   "",
+	"bển":    "",
+	"bệt":    "",
+	"bị":     "",
+	"bỏ":     "",
+	"bỗng":   "",
+	"bộ":     "",
+	"bớ":     "",
+	"bởi":    "",
+	"bức":    "",
+	"cao":    "",
+	"cha":    "",
+	"chiếc":  "",
+	"cho":    "",
+	"choa":   "",
+	"chung":  "",
+	"chuyển": "",
+	"chuyện": "",
+	"chính":  "",
+	"chú":    "",
+	"chúng":  "",
+	"chăng":  "",
+	"chơi":   "",
+	"chưa":   "",
+	"chậc":   "",
+	"chắc":   "",
+	"chỉ":    "",
+	"chỉn":   "",
+	"chị":    "",
+	"chịu":   "",
+	"chọn":   "",
+	"chớ":    "",
+	"chợt":   "",
+	"chủn":   "",
+	"chứ":    "",
+	"con":    "",
+	"cuối":   "",
+	"cuốn":   "",
+	"cuộc":   "",
+	"càng":   "",
+	"các":    "",
+	"cách":   "",
+	"cái":    "",
+	"cây":    "",
+	"còn":    "",
+	"có":     "",
+	"cô":     "",
+	"cùng":   "",
+	"căn":    "",
+	"cũng":   "",
+	"cơ":     "",
+	"cơn":    "",
+	"cả":     "",
+	"cấp":    "",
+	"cần":    "",
+	"cậu":    "",
+	"của":    "",
+	"cứ":     "",
+	"do":     "",
+	"duy":    "",
+	"dài":    "",
+	"dành":   "",
+	"dào":    "",
+	"dì":     "",
+	"dù":     "",
+	"dùng":   "",
+	"dưới":   "",
+	"dạ":     "",
+	"dẫn":    "",
+	"dẫu":    "",
+	"dễ":     "",
+	"dữ":     "",
+	"em":     "",
+	"giảm":   "",
+	"giống":  "",
+	"giờ":    "",
+	"giữ":    "",
+	"giữa":   "",
+	"gây":    "",
+	"gì":     "",
+	"gần":    "",
+	"gặp":    "",
+	"gồm":    "",
+	"hay":    "",
+	"hiểu":   "",
+	"hoặc":   "",
+	"hãy":    "",
+	"hơn":    "",
+	"hết":    "",
+	"họ":     "",
+	"hỏi":    "",
+	"khi":    "",
+	"khiến":  "",
+	"khoảng": "",
+	"khá":    "",
+	"khác":   "",
+	"khách":  "",
+	"khó":    "",
+	"không":  "",
+	"khỏi":   "",
+	"kể":     "",
+	"loại":   "",
+	"luôn":   "",
+	"là":     "",
+	"làm":    "",
+	"lâu":    "",
+	"lên":    "",
+	"lòng":   "",
+	"lúc":    "",
+	"lượng":  "",
+	"lại":    "",
+	"lấy":    "",
+	"lần":    "",
+	"lớn":    "",
+	"lời":    "",
+	"mang":   "",
+	"muốn":   "",
+	"mà":     "",
+	"mình":   "",
+	"mạnh":   "",
+	"mất":    "",
+	"mọi":    "",
+	"mối":    "",
+	"mỗi":    "",
+	"một":    "",
+	"mới":    "",
+	"mở":     "",
+	"mợ":     "",
+	"mức":    "",
+	"nay":    "",
+	"ngay":   "",
+	"nghe":   "",
+	"nghen":  "",
+	"nghĩ":   "",
+	"nghỉm":  "",
+	"ngoài":  "",
+	"ngoải":  "",
+	"nguồn":  "",
+	"ngày":   "",
+	"ngôi":   "",
+	"ngươi":  "",
+	"người":  "",
+	"ngọn":   "",
+	"ngọt":   "",
+	"ngồi":   "",
+	"nhanh":  "",
+	"nhau":   "",
+	"nhiều":  "",
+	"nhà":    "",
+	"nhé":    "",
+	"nhìn":   "",
+	"nhóm":   "",
+	"như":    "",
+	"nhưng":  "",
+	"nhất":   "",
+	"nhận":   "",
+	"nhằm":   "",
+	"nhỉ":    "",
+	"nhỏ":    "",
+	"nhớ":    "",
+	"nhờ":    "",
+	"những":  "",
+	"nào":    "",
+	"này":    "",
+	"nên":    "",
+	"nó":     "",
+	"nóc":    "",
+	"nói":    "",
+	"năm":    "",
+	"nơi":    "",
+	"nước":   "",
+	"nấy":    "",
+	"nặng":   "",
+	"nếu":    "",
+	"nền":    "",
+	"nọ":     "",
+	"nớ":     "",
+	"nữa":    "",
+	"oái":    "",
+	"pho":    "",
+	"phè":    "",
+	"phía":   "",
+	"phóc":   "",
+	"phót":   "",
+	"phải":   "",
+	"phần":   "",
+	"phắt":   "",
+	"phỏng":  "",
+	"phốc":   "",
+	"phụt":   "",
+	"phứt":   "",
+	"qua":    "",
+	"quay":   "",
+	"quá":    "",
+	"quả":    "",
+	"quận":   "",
+	"ra":     "",
+	"riêng":  "",
+	"riệt":   "",
+	"rày":    "",
+	"ráo":    "",
+	"rén":    "",
+	"rích":   "",
+	"rõ":     "",
+	"răng":   "",
+	"rất":    "",
+	"rằng":   "",
+	"rồi":    "",
+	"rứa":    "",
+	"sang":   "",
+	"sao":    "",
+	"sau":    "",
+	"so":     "",
+	"suýt":   "",
+	"sáng":   "",
+	"sì":     "",
+	"sất":    "",
+	"sắp":    "",
+	"sẽ":     "",
+	"số":     "",
+	"sớm":    "",
+	"sự":     "",
+	"tanh":   "",
+	"tay":    "",
+	"thanh":  "",
+	"theo":   "",
+	"thiếu":  "",
+	"thoạt":  "",
+	"thoắt":  "",
+	"thuần":  "",
+	"thuộc":  "",
+	"thà":    "",
+	"tháng":  "",
+	"thêm":   "",
+	"thì":    "",
+	"thích":  "",
+	"thím":   "",
+	"thôi":   "",
+	"thường": "",
+	"thấp":   "",
+	"thấy":   "",
+	"thẩy":   "",
+	"thậm":   "",
+	"thật":   "",
+	"thế":    "",
+	"thếch":  "",
+	"thỏm":   "",
+	"thốc":   "",
+	"thốt":   "",
+	"thộc":   "",
+	"thứ":    "",
+	"thửa":   "",
+	"tin":    "",
+	"toà":    "",
+	"toẹt":   "",
+	"trong":  "",
+	"tránh":  "",
+	"trên":   "",
+	"trước":  "",
+	"trả":    "",
+	"trển":   "",
+	"trệt":   "",
+	"trỏng":  "",
+	"tuy":    "",
+	"tuổi":   "",
+	"tên":    "",
+	"tênh":   "",
+	"tìm":    "",
+	"tính":   "",
+	"tôi":    "",
+	"tăng":   "",
+	"tại":    "",
+	"tạo":    "",
+	"tấm":    "",
+	"tấn":    "",
+	"tắp":    "",
+	"tọt":    "",
+	"tốt":    "",
+	"tột":    "",
+	"tớ":     "",
+	"tới":    "",
+	"từ":     "",
+	"từng":   "",
+	"tự":     "",
+	"veo":    "",
+	"việc":   "",
+	"và":     "",
+	"vài":    "",
+	"vào":    "",
+	"vâng":   "",
+	"vèo":    "",
+	"vì":     "",
+	"vùng":   "",
+	"vượt":   "",
+	"vẫn":    "",
+	"vậy":    "",
+	"về":     "",
+	"với":    "",
+	"vở":     "",
+	"vụt":    "",
+	"vừa":    "",
+	"xa":     "",
+	"xem":    "",
+	"xin":    "",
+	"xoét":   "",
+	"xoẳn":   "",
+	"xoẹt":   "",
+	"xuể":    "",
+	"xuống":  "",
+	"xệp":    "",
+	"à":      "",
+	"ào":     "",
+	"á":      "",
+	"ái":     "",
+	"áng":    "",
+	"ít":     "",
+	"ông":    "",
+	"úi":     "",
+	"ý":      "",
+	"ăn":     "",
+	"đang":   "",
+	"điều":   "",
+	"điểm":   "",
+	"đáng":   "",
+	"đâu":    "",
+	"đây":    "",
+	"đã":     "",
+	"đó":     "",
+	"đúng":   "",
+	"đưa":    "",
+	"được":   "",
+	"đạt":    "",
+	"đầy":    "",
+	"đặt":    "",
+	"đến":    "",
+	"đều":    "",
+	"để":     "",
+	"đủ":     "",
+	"ơ":      "",
+	"ơi":     "",
+	"ư":      "",
+	"ạ":      "",
+	"ấy":     "",
+	"ắt":     "",
+	"ồ":      "",
+	"ổng":    "",
+	"ớ":      "",
+	"ờ":      "",
+	"ở":      "",
+	"ủa":     "",
+	"ừ":      "",
+	"ử":      "",
+}

From 19c8cf40a2558b74d28babf095ce9c3a52b42729 Mon Sep 17 00:00:00 2001
From: Cotch22 <gglt666@126.com>
Date: Thu, 20 Jul 2023 11:26:53 +0800
Subject: [PATCH 2/3] Update gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index b15967c..a44d075 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.idea/
 coverage.out
 /nbproject/private/
 /nbproject/

From ab218014fbff7a661ad4015daef36cb9fc7bd6ed Mon Sep 17 00:00:00 2001
From: Cotch22 <gglt666@126.com>
Date: Thu, 20 Jul 2023 11:31:54 +0800
Subject: [PATCH 3/3] Update README

---
 README.md    | 1 +
 stopwords.go | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9a26011..ea6f24f 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ It uses a curated list of the most frequent words used in these languages:
  * Swedish
  * Thai
  * Turkish
+ * Vietnamese
 
 If the function is used with an unsupported language, it doesn't fail, but will apply english filter to the content.
 
diff --git a/stopwords.go b/stopwords.go
index 985af71..d57437e 100644
--- a/stopwords.go
+++ b/stopwords.go
@@ -8,7 +8,8 @@
 //
 // arabic, bulgarian, czech, danish, english, finnish, french, german,
 // hungarian, italian, japanese, latvian, norwegian, persian, polish,
-// portuguese, romanian, russian, slovak, spanish, swedish, turkish
+// portuguese, romanian, russian, slovak, spanish, swedish, turkish,
+// vietnamese
 
 // Package stopwords contains various algorithms of text comparison (Simhash, Levenshtein)
 package stopwords