From 73944feb6a8c79eb158cae8338fdabdd36570e44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szab=C3=B3=20Gergely?= Date: Fri, 18 Nov 2016 21:59:40 +0100 Subject: [PATCH] Update SimpleTokenizer.java A character encoding problem occurred when I cloned this repo. I recommend using Unicode escapes instead of literal Unicode characters. --- .../cmu/sphinx/alignment/SimpleTokenizer.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sphinx4-core/src/main/java/edu/cmu/sphinx/alignment/SimpleTokenizer.java b/sphinx4-core/src/main/java/edu/cmu/sphinx/alignment/SimpleTokenizer.java index f0bfb6541..1df5f57b0 100644 --- a/sphinx4-core/src/main/java/edu/cmu/sphinx/alignment/SimpleTokenizer.java +++ b/sphinx4-core/src/main/java/edu/cmu/sphinx/alignment/SimpleTokenizer.java @@ -15,16 +15,16 @@ public class SimpleTokenizer implements TextTokenizer { public List expand(String text) { - text = text.replace('’', '\''); - text = text.replace('‘', ' '); - text = text.replace('”', ' '); - text = text.replace('“', ' '); - text = text.replace('"', ' '); - text = text.replace('»', ' '); - text = text.replace('«', ' '); - text = text.replace('–', '-'); - text = text.replace('—', ' '); - text = text.replace('…', ' '); + text = text.replace('\u2019', '\''); + text = text.replace('\u2018', ' '); + text = text.replace('\u201D', ' '); + text = text.replace('\u201C', ' '); + text = text.replace('\u0022', ' '); + text = text.replace('\u00BB', ' '); + text = text.replace('\u00AB', ' '); + text = text.replace('\u2013', '-'); + text = text.replace('\u2014', ' '); + text = text.replace('\u2026', ' '); text = text.replace(" - ", " "); text = text.replaceAll("[/_*%]", " ");