fix:voc_match (#327)

Handle variations in accents/punctuation in the .voc files. This is common in written chat and in some postprocessing applied to STT output. For example, the OCP pipeline was not matching "noticias" vs "notícias" ("news" in Portuguese).
OpenVoiceOS · Jan 30, 2025 · 43b1e20 · 43b1e20
1 parent 097ae6a
commit 43b1e20
Showing 1 changed file with 33 additions and 8 deletions.
diff --git a/ovos_workshop/skills/ovos.py b/ovos_workshop/skills/ovos.py
@@ -4,9 +4,11 @@
 import os
 import re
 import shutil
+import string
 import sys
 import time
 import traceback
+import unicodedata
 from copy import copy
 from hashlib import md5
 from inspect import signature
@@ -17,17 +19,19 @@
 
 from json_database import JsonStorage
 from langcodes import closest_match
+from ovos_config.config import Configuration
+from ovos_config.locations import get_xdg_cache_save_path
+from ovos_config.locations import get_xdg_config_save_path
+from ovos_number_parser import pronounce_number, extract_number
+from ovos_yes_no_solver import YesNoSolver
+
 from ovos_bus_client import MessageBusClient
 from ovos_bus_client.apis.enclosure import EnclosureAPI
 from ovos_bus_client.apis.gui import GUIInterface
 from ovos_bus_client.apis.ocp import OCPInterface
 from ovos_bus_client.message import Message, dig_for_message
 from ovos_bus_client.session import SessionManager, Session
 from ovos_bus_client.util import get_message_lang
-from ovos_config.config import Configuration
-from ovos_config.locations import get_xdg_cache_save_path
-from ovos_config.locations import get_xdg_config_save_path
-from ovos_number_parser import pronounce_number, extract_number
 from ovos_plugin_manager.language import OVOSLangTranslationFactory, OVOSLangDetectionFactory
 from ovos_utils import camel_case_split, classproperty
 from ovos_utils.dialog import MustacheDialogRenderer
@@ -42,9 +46,6 @@
 from ovos_utils.process_utils import ProcessStatus, StatusCallbackMap
 from ovos_utils.process_utils import RuntimeRequirements
 from ovos_utils.skills import get_non_properties
-from ovos_yes_no_solver import YesNoSolver
-from padacioso import IntentContainer
-
 from ovos_workshop.decorators.killable import AbortEvent, killable_event, \
     AbortQuestion
 from ovos_workshop.decorators.layers import IntentLayers
@@ -54,6 +55,7 @@
 from ovos_workshop.resource_files import ResourceFile, \
     CoreResources, find_resource, SkillResources
 from ovos_workshop.settings import PrivateSettings
+from padacioso import IntentContainer
 
 
 def simple_trace(stack_trace: List[str]) -> str:
@@ -2210,7 +2212,7 @@ def voc_list(self, voc_filename: str,
         return self._voc_cache.get(cache_key) or []
 
     def voc_match(self, utt: str, voc_filename: str, lang: Optional[str] = None,
-                  exact: bool = False):
+                  exact: bool = False, ensure_ascii=True):
         """
         Determine if the given utterance contains the vocabulary provided.
 
@@ -2229,6 +2231,7 @@ def voc_match(self, utt: str, voc_filename: str, lang: Optional[str] = None,
                                 'locale/en-us/cancel.voc')
             lang (str): Language code, defaults to self.lang
             exact (bool): Whether the vocab must exactly match the utterance
+            ensure_ascii (bool): Whether to ignore accents and punctuation
 
         Returns:
             bool: True if the utterance has the given vocabulary it
@@ -2243,6 +2246,10 @@ def voc_match(self, utt: str, voc_filename: str, lang: Optional[str] = None,
             return False
 
         if utt and _vocs:
+            if ensure_ascii:
+                utt = remove_accents_and_punct(utt)
+                _vocs = [remove_accents_and_punct(v) for v in _vocs]
+
             if exact:
                 # Check for exact match
                 match = any(i.strip().lower() == utt.lower()
@@ -2764,3 +2771,21 @@ def _join_word_list_es(items: List[str], connector: str, sep: str = ",") -> str:
             final_connector = "u"
 
     return f"{joined_string} {final_connector} {items[-1]}"
+
+# TODO - move to ovos-utils
+def remove_accents_and_punct(input_str: str) -> str:
+    """
+    Normalize the input string by removing accents and punctuation (except for '{' and '}').
+
+    Args:
+        input_str (str): The input string to be processed.
+
+    Returns:
+        str: The processed string with accents and punctuation removed.
+    """
+    rm_chars = [c for c in string.punctuation if c not in ("{", "}")]
+    # Normalize to NFD (Normalization Form D, canonical decomposition), which splits accented characters into a base character plus combining marks
+    nfkd_form = unicodedata.normalize('NFD', input_str)
+    # Drop combining marks (Unicode category 'Mn') and the string.punctuation characters other than '{' and '}'; all other characters are kept
+    return ''.join([char for char in nfkd_form
+                    if unicodedata.category(char) != 'Mn' and char not in rm_chars])