From 7060e87f92e8ee4dbb701ecc1c1907c71ed4e255 Mon Sep 17 00:00:00 2001 From: Cotch22 Date: Thu, 20 Jul 2023 11:26:03 +0800 Subject: [PATCH 1/3] Add vietnamese --- custom.go | 61 ++++---- simhash.go | 2 + stopwords.go | 6 +- stopwords_vi.go | 379 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 418 insertions(+), 30 deletions(-) create mode 100644 stopwords_vi.go diff --git a/custom.go b/custom.go index 8139714..425e64d 100644 --- a/custom.go +++ b/custom.go @@ -19,7 +19,7 @@ import ( func LoadStopWordsFromFile(filePath string, langCode string, sep string) { b, err := ioutil.ReadFile(filePath) if err != nil { - panic(err) + panic(err) } LoadStopWordsFromString(string(b), langCode, sep) } @@ -41,137 +41,142 @@ func LoadStopWordsFromString(wordsList string, langCode string, sep string) { case "ar": arabic = make(map[string]string) for _, word := range words { - arabic[word] = "" + arabic[word] = "" } case "bg": bulgarian = make(map[string]string) for _, word := range words { - bulgarian[word] = "" + bulgarian[word] = "" } case "cs": czech = make(map[string]string) for _, word := range words { - czech[word] = "" + czech[word] = "" } case "da": danish = make(map[string]string) for _, word := range words { - danish[word] = "" + danish[word] = "" } case "de": german = make(map[string]string) for _, word := range words { - german[word] = "" + german[word] = "" } case "el": greek = make(map[string]string) for _, word := range words { - greek[word] = "" + greek[word] = "" } case "en": english = make(map[string]string) for _, word := range words { - english[word] = "" + english[word] = "" } case "es": spanish = make(map[string]string) for _, word := range words { - spanish[word] = "" + spanish[word] = "" } case "fa": persian = make(map[string]string) for _, word := range words { - persian[word] = "" + persian[word] = "" } case "fr": french = make(map[string]string) for _, word := range words { - french[word] = "" + french[word] = "" } case "fi": finnish = make(map[string]string) for _, word := range words { - finnish[word] = "" + finnish[word] = "" } case "hu": hungarian = make(map[string]string) for _, word := range words { - hungarian[word] = "" + hungarian[word] = "" } case "id": indonesian = make(map[string]string) for _, word := range words { - indonesian[word] = "" + indonesian[word] = "" } case "it": italian = make(map[string]string) for _, word := range words { - italian[word] = "" + italian[word] = "" } case "ja": japanese = make(map[string]string) for _, word := range words { - japanese[word] = "" + japanese[word] = "" } case "km": khmer = make(map[string]string) for _, word := range words { - khmer[word] = "" + khmer[word] = "" } case "lv": latvian = make(map[string]string) for _, word := range words { - latvian[word] = "" + latvian[word] = "" } case "nl": dutch = make(map[string]string) for _, word := range words { - dutch[word] = "" + dutch[word] = "" } case "no": norwegian = make(map[string]string) for _, word := range words { - norwegian[word] = "" + norwegian[word] = "" } case "pl": polish = make(map[string]string) for _, word := range words { - polish[word] = "" + polish[word] = "" } case "pt": portuguese = make(map[string]string) for _, word := range words { - portuguese[word] = "" + portuguese[word] = "" } case "ro": romanian = make(map[string]string) for _, word := range words { - romanian[word] = "" + romanian[word] = "" } case "ru": russian = make(map[string]string) for _, word := range words { - russian[word] = "" + russian[word] = "" } case "sk": slovak = make(map[string]string) for _, word := range words { - slovak[word] = "" + slovak[word] = "" } case "sv": swedish = make(map[string]string) for _, word := range words { - swedish[word] = "" + swedish[word] = "" } case "th": thai = make(map[string]string) for _, word := range words { - thai[word] = "" + thai[word] = "" } case "tr": turkish = make(map[string]string) for _, word := range words { - turkish[word] = "" + turkish[word] = "" + } + case "vi": + vietnamese = make(map[string]string) + for _, word := range words { + vietnamese[word] = "" } } } diff --git a/simhash.go b/simhash.go index 09c63c9..3a68430 100644 --- a/simhash.go +++ b/simhash.go @@ -95,6 +95,8 @@ func Simhash(content []byte, langCode string, cleanHTML bool) uint64 { hash = removeStopWordsAndHash(content, thai) case "tr": hash = removeStopWordsAndHash(content, turkish) + case "vi": + hash = removeStopWordsAndHash(content, vietnamese) } return hash diff --git a/stopwords.go b/stopwords.go index c4b52d0..985af71 100644 --- a/stopwords.go +++ b/stopwords.go @@ -23,8 +23,8 @@ import ( ) var ( - remTags = regexp.MustCompile(`<[^>]*>`) - oneSpace = regexp.MustCompile(`\s{2,}`) + remTags = regexp.MustCompile(`<[^>]*>`) + oneSpace = regexp.MustCompile(`\s{2,}`) wordSegmenter = regexp.MustCompile(`[\pL\p{Mc}\p{Mn}-_']+`) ) @@ -118,6 +118,8 @@ func Clean(content []byte, langCode string, cleanHTML bool) []byte { content = removeStopWords(content, thai) case "tr": content = removeStopWords(content, turkish) + case "vi": + content = removeStopWords(content, vietnamese) } //Remove duplicated space characters diff --git a/stopwords_vi.go b/stopwords_vi.go new file mode 100644 index 0000000..d2e6752 --- /dev/null +++ b/stopwords_vi.go @@ -0,0 +1,379 @@ +// Copyright (c) 2023, Cotch22. +// Use of this source code is governed by the BSD license +// license that can be found in the LICENSE file. + +package stopwords + +var vietnamese = map[string]string{ + "ai": "", + "alô": "", + "amen": "", + "anh": "", + "ba": "", + "biết": "", + "buổi": "", + "bà": "", + "bài": "", + "bác": "", + "bán": "", + "bèn": "", + "béng": "", + "bên": "", + "bông": "", + "bước": "", + "bạn": "", + "bản": "", + "bấy": "", + "bằng": "", + "bển": "", + "bệt": "", + "bị": "", + "bỏ": "", + "bỗng": "", + "bộ": "", + "bớ": "", + "bởi": "", + "bức": "", + "cao": "", + "cha": "", + "chiếc": "", + "cho": "", + "choa": "", + "chung": "", + "chuyển": "", + "chuyện": "", + "chính": "", + "chú": "", + "chúng": "", + "chăng": "", + "chơi": "", + "chưa": "", + "chậc": "", + "chắc": "", + "chỉ": "", + "chỉn": "", + "chị": "", + "chịu": "", + "chọn": "", + "chớ": "", + "chợt": "", + "chủn": "", + "chứ": "", + "con": "", + "cuối": "", + "cuốn": "", + "cuộc": "", + "càng": "", + "các": "", + "cách": "", + "cái": "", + "cây": "", + "còn": "", + "có": "", + "cô": "", + "cùng": "", + "căn": "", + "cũng": "", + "cơ": "", + "cơn": "", + "cả": "", + "cấp": "", + "cần": "", + "cậu": "", + "của": "", + "cứ": "", + "do": "", + "duy": "", + "dài": "", + "dành": "", + "dào": "", + "dì": "", + "dù": "", + "dùng": "", + "dưới": "", + "dạ": "", + "dẫn": "", + "dẫu": "", + "dễ": "", + "dữ": "", + "em": "", + "giảm": "", + "giống": "", + "giờ": "", + "giữ": "", + "giữa": "", + "gây": "", + "gì": "", + "gần": "", + "gặp": "", + "gồm": "", + "hay": "", + "hiểu": "", + "hoặc": "", + "hãy": "", + "hơn": "", + "hết": "", + "họ": "", + "hỏi": "", + "khi": "", + "khiến": "", + "khoảng": "", + "khá": "", + "khác": "", + "khách": "", + "khó": "", + "không": "", + "khỏi": "", + "kể": "", + "loại": "", + "luôn": "", + "là": "", + "làm": "", + "lâu": "", + "lên": "", + "lòng": "", + "lúc": "", + "lượng": "", + "lại": "", + "lấy": "", + "lần": "", + "lớn": "", + "lời": "", + "mang": "", + "muốn": "", + "mà": "", + "mình": "", + "mạnh": "", + "mất": "", + "mọi": "", + "mối": "", + "mỗi": "", + "một": "", + "mới": "", + "mở": "", + "mợ": "", + "mức": "", + "nay": "", + "ngay": "", + "nghe": "", + "nghen": "", + "nghĩ": "", + "nghỉm": "", + "ngoài": "", + "ngoải": "", + "nguồn": "", + "ngày": "", + "ngôi": "", + "ngươi": "", + "người": "", + "ngọn": "", + "ngọt": "", + "ngồi": "", + "nhanh": "", + "nhau": "", + "nhiều": "", + "nhà": "", + "nhé": "", + "nhìn": "", + "nhóm": "", + "như": "", + "nhưng": "", + "nhất": "", + "nhận": "", + "nhằm": "", + "nhỉ": "", + "nhỏ": "", + "nhớ": "", + "nhờ": "", + "những": "", + "nào": "", + "này": "", + "nên": "", + "nó": "", + "nóc": "", + "nói": "", + "năm": "", + "nơi": "", + "nước": "", + "nấy": "", + "nặng": "", + "nếu": "", + "nền": "", + "nọ": "", + "nớ": "", + "nữa": "", + "oái": "", + "pho": "", + "phè": "", + "phía": "", + "phóc": "", + "phót": "", + "phải": "", + "phần": "", + "phắt": "", + "phỏng": "", + "phốc": "", + "phụt": "", + "phứt": "", + "qua": "", + "quay": "", + "quá": "", + "quả": "", + "quận": "", + "ra": "", + "riêng": "", + "riệt": "", + "rày": "", + "ráo": "", + "rén": "", + "rích": "", + "rõ": "", + "răng": "", + "rất": "", + "rằng": "", + "rồi": "", + "rứa": "", + "sang": "", + "sao": "", + "sau": "", + "so": "", + "suýt": "", + "sáng": "", + "sì": "", + "sất": "", + "sắp": "", + "sẽ": "", + "số": "", + "sớm": "", + "sự": "", + "tanh": "", + "tay": "", + "thanh": "", + "theo": "", + "thiếu": "", + "thoạt": "", + "thoắt": "", + "thuần": "", + "thuộc": "", + "thà": "", + "tháng": "", + "thêm": "", + "thì": "", + "thích": "", + "thím": "", + "thôi": "", + "thường": "", + "thấp": "", + "thấy": "", + "thẩy": "", + "thậm": "", + "thật": "", + "thế": "", + "thếch": "", + "thỏm": "", + "thốc": "", + "thốt": "", + "thộc": "", + "thứ": "", + "thửa": "", + "tin": "", + "toà": "", + "toẹt": "", + "trong": "", + "tránh": "", + "trên": "", + "trước": "", + "trả": "", + "trển": "", + "trệt": "", + "trỏng": "", + "tuy": "", + "tuổi": "", + "tên": "", + "tênh": "", + "tìm": "", + "tính": "", + "tôi": "", + "tăng": "", + "tại": "", + "tạo": "", + "tấm": "", + "tấn": "", + "tắp": "", + "tọt": "", + "tốt": "", + "tột": "", + "tớ": "", + "tới": "", + "từ": "", + "từng": "", + "tự": "", + "veo": "", + "việc": "", + "và": "", + "vài": "", + "vào": "", + "vâng": "", + "vèo": "", + "vì": "", + "vùng": "", + "vượt": "", + "vẫn": "", + "vậy": "", + "về": "", + "với": "", + "vở": "", + "vụt": "", + "vừa": "", + "xa": "", + "xem": "", + "xin": "", + "xoét": "", + "xoẳn": "", + "xoẹt": "", + "xuể": "", + "xuống": "", + "xệp": "", + "à": "", + "ào": "", + "á": "", + "ái": "", + "áng": "", + "ít": "", + "ông": "", + "úi": "", + "ý": "", + "ăn": "", + "đang": "", + "điều": "", + "điểm": "", + "đáng": "", + "đâu": "", + "đây": "", + "đã": "", + "đó": "", + "đúng": "", + "đưa": "", + "được": "", + "đạt": "", + "đầy": "", + "đặt": "", + "đến": "", + "đều": "", + "để": "", + "đủ": "", + "ơ": "", + "ơi": "", + "ư": "", + "ạ": "", + "ấy": "", + "ắt": "", + "ồ": "", + "ổng": "", + "ớ": "", + "ờ": "", + "ở": "", + "ủa": "", + "ừ": "", + "ử": "", +} From 19c8cf40a2558b74d28babf095ce9c3a52b42729 Mon Sep 17 00:00:00 2001 From: Cotch22 Date: Thu, 20 Jul 2023 11:26:53 +0800 Subject: [PATCH 2/3] Update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b15967c..a44d075 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.idea/ coverage.out /nbproject/private/ /nbproject/ From ab218014fbff7a661ad4015daef36cb9fc7bd6ed Mon Sep 17 00:00:00 2001 From: Cotch22 Date: Thu, 20 Jul 2023 11:31:54 +0800 Subject: [PATCH 3/3] Update README --- README.md | 1 + stopwords.go | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9a26011..ea6f24f 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ It uses a curated list of the most frequent words used in these languages: * Swedish * Thai * Turkish + * Vietnamese If the function is used with an unsupported language, it doesn't fail, but will apply english filter to the content. diff --git a/stopwords.go b/stopwords.go index 985af71..d57437e 100644 --- a/stopwords.go +++ b/stopwords.go @@ -8,7 +8,8 @@ // // arabic, bulgarian, czech, danish, english, finnish, french, german, // hungarian, italian, japanese, latvian, norwegian, persian, polish, -// portuguese, romanian, russian, slovak, spanish, swedish, turkish +// portuguese, romanian, russian, slovak, spanish, swedish, turkish, +// vietnamese // Package stopwords contains various algorithms of text comparison (Simhash, Levenshtein) package stopwords