Added abbreviation handling to Idlak

Skaiste · Jun 6, 2018 · 0dcf922 · 0dcf922
1 parent dd152b9
commit 0dcf922
Show file tree

Hide file tree

Showing 45 changed files with 2,331 additions and 2,140 deletions.
diff --git a/idlak-data/en/nrules-default.xml b/idlak-data/en/nrules-default.xml
@@ -8,7 +8,7 @@
     to deal with hyphenation.
   </comment>
   <exp>
-    <![CDATA[^([ \n\t\r\-]+)]]>
+    <![CDATA[^([ \n\t\r]+)]]>
   </exp>
 </regex>
 

diff --git a/src/idlaktxp/Makefile b/src/idlaktxp/Makefile
@@ -11,7 +11,7 @@ TESTFILES = idlak-mod-test
 
 OBJFILES = txpxmldata.o txputf8.o txppcre.o txpnrules.o txppos.o \
 	   txppbreak.o txpsylmax.o txplexicon.o txplts.o txpmodule.o txpcexspec.o \
-           txpparse-options.o \
+           txpparse-options.o txpabbrev.o \
 	   cexfunctions.o cexfunctionscatalog.o mod-tokenise.o \
 	   mod-postag.o mod-pauses.o mod-phrasing.o mod-pronounce.o mod-syllabify.o mod-cex.o
 

diff --git a/src/idlaktxp/mod-tokenise.cc b/src/idlaktxp/mod-tokenise.cc
@@ -26,15 +26,17 @@ bool TxpTokenise::Init(const TxpParseOptions &opts) {
   opts_ = &opts;
   tpdb_ = opts.GetTpdb();
   nrules_.Init(opts, std::string(opts_->GetValue(GetName().c_str(), "arch")));
-  return nrules_.Parse(tpdb_);
+  abbrev_.Init(opts, std::string(GetOptValue("arch")));
+  return nrules_.Parse(tpdb_) && abbrev_.Parse(tpdb_);
 }
 
 bool TxpTokenise::Process(pugi::xml_document* input) {
   const char* p;
   int32 n = 0;
   int32 offset, col = 0;
   std::string token, wspace, tmp;
-  pugi::xml_node tkroot, tk, ws, ntxt;
+  pugi::xml_node tkroot, tk, tkcopy, ws, ntxt, lex;
+  TxpAbbrevInfo * abbrev_info;
   // all text nodes are tokenised
   pugi::xpath_node_set text =
       input->document_element().select_nodes("//text()");
@@ -55,7 +57,41 @@ bool TxpTokenise::Process(pugi::xml_document* input) {
         ntxt.set_value(token.c_str());
         n += 1;
         nrules_.ReplaceUtf8Punc(token, &tmp);
-        SetPuncCaseInfo(&tmp, &tk);
+        /// check for full token matches without partial punctuation
+        /// e.g. :-) but not (US)
+        abbrev_info = abbrev_.LookupAbbrev(token.c_str());
+        if (abbrev_info) {
+          for(int32 i = 0; i < abbrev_info->expansions.size(); i++) {
+            if (!i) {
+              tk.append_attribute("norm");
+              tk.attribute("norm").set_value(abbrev_info->expansions[0].c_str());
+              if (!abbrev_info->lexentries[0].empty()) {
+                lex = tk.parent().insert_child_after("lex", tk);
+                lex.append_attribute("type").set_value(abbrev_info->lexentries[0].c_str());
+                tkcopy = lex.append_copy(tk);
+                tk.parent().remove_child(tk);
+                tk = tkcopy;
+              }
+            }
+            else {
+              if  (!abbrev_info->lexentries[i].empty()) {
+                lex = tk.parent().insert_child_after("lex", tk);
+                lex.append_attribute("type").set_value(abbrev_info->lexentries[i].c_str());
+                tk = lex.append_child("tk");
+                tk.append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
+                tk = lex; //so we add new tokens after the lex tag not inside it
+              }
+              else {
+                tk = tk.parent().insert_child_after("tk", tk);
+                tk.append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
+              }
+            }
+          }
+        }
+        // if no full match of an abbreviation unpack punctuation
+        else {
+          SetPuncCaseInfo(&tmp, &tk);
+        }
       }
       if (wspace.length()) {
         if (token.length())
@@ -88,35 +124,82 @@ bool TxpTokenise::Process(pugi::xml_document* input) {
 int32 TxpTokenise::SetPuncCaseInfo(std::string* tkin, pugi::xml_node* tk) {
   const char* p;
   TxpCaseInfo caseinfo;
-  pugi::xml_node node;
+  pugi::xml_node node, lex, tkcopy;
   int32 n = 0;
   std::string token;
   std::string prepunc;
   std::string pstpunc;
+  TxpAbbrevInfo * abbrev_info;
   p = tkin->c_str();
   while (*p) {
     p = nrules_.ConsumePunc(p, &prepunc, &token, &pstpunc);
     if (n) {
       *tk = tk->parent().insert_child_after("tk", *tk);
     }
-    if (prepunc.length()) {
-      tk->append_attribute("prepunc");
-      tk->attribute("prepunc").set_value(prepunc.c_str());
-    }
-    if (token.length()) {
-      tk->append_attribute("norm");
-      nrules_.NormCaseCharacter(&token, caseinfo);
-      tk->attribute("norm").set_value(token.c_str());
+    // check to see if there is an abbreviation that matches the token and
+    // completely or partially matches the punctuation
+    abbrev_info = abbrev_.LookupAbbrev(token.c_str(), prepunc.c_str(), pstpunc.c_str());
+    if (abbrev_info) {
+      // trim punctuation as appropriate
+      prepunc = prepunc.substr(0, prepunc.size() - abbrev_.CheckPrePunc(prepunc.c_str(), abbrev_info));
+      pstpunc = pstpunc.substr(abbrev_.CheckPstPunc(pstpunc.c_str(), abbrev_info),
+                                                    pstpunc.size() - abbrev_.CheckPstPunc(pstpunc.c_str(),
+                                                                                          abbrev_info));
+      for(int32 i = 0; i < abbrev_info->expansions.size(); i++) {
+        if (!i) {
+          tk->append_attribute("norm");
+          tk->attribute("norm").set_value(abbrev_info->expansions[0].c_str());
+          if (!abbrev_info->lexentries[0].empty()) {
+            lex = tk->parent().insert_child_after("lex", *tk);
+            lex.append_attribute("type").set_value(abbrev_info->lexentries[0].c_str());
+            tkcopy = lex.append_copy(*tk);
+            tk->parent().remove_child(*tk);
+            *tk = tkcopy;
+          }
+          if (prepunc.length()) {
+            tk->append_attribute("prepunc");
+            tk->attribute("prepunc").set_value(prepunc.c_str());
+          }
+        }
+        else {
+          if  (!abbrev_info->lexentries[i].empty()) {
+            lex = tk->parent().insert_child_after("lex", *tk);
+            lex.append_attribute("type").set_value(abbrev_info->lexentries[i].c_str());
+            *tk = lex.append_child("tk");
+            tk->append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
+            *tk = lex;
+          }
+          else {
+            *tk = tk->parent().insert_child_after("tk", *tk);
+            tk->append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
+          }
+        }
+      }
+      if (pstpunc.length()) {
+        tk->append_attribute("pstpunc");
+        tk->attribute("pstpunc").set_value(pstpunc.c_str());
+      }
     }
-    if (pstpunc.length()) {
-      tk->append_attribute("pstpunc");
-      tk->attribute("pstpunc").set_value(pstpunc.c_str());
+    else {
+      if (prepunc.length()) {
+        tk->append_attribute("prepunc");
+        tk->attribute("prepunc").set_value(prepunc.c_str());
+      }
+      if (token.length()) {
+        tk->append_attribute("norm");
+        nrules_.NormCaseCharacter(&token, caseinfo);
+        tk->attribute("norm").set_value(token.c_str());
+      }
+      if (pstpunc.length()) {
+        tk->append_attribute("pstpunc");
+        tk->attribute("pstpunc").set_value(pstpunc.c_str());
+      }
+      if (caseinfo.lowercase) tk->append_attribute("lc").set_value("true");
+      if (caseinfo.uppercase) tk->append_attribute("uc").set_value("true");
+      if (caseinfo.foreign) tk->append_attribute("foreign").set_value("true");
+      if (caseinfo.symbols) tk->append_attribute("symbols").set_value("true");
+      if (caseinfo.capitalised) tk->append_attribute("caps").set_value("true");
     }
-    if (caseinfo.lowercase) tk->append_attribute("lc").set_value("true");
-    if (caseinfo.uppercase) tk->append_attribute("uc").set_value("true");
-    if (caseinfo.foreign) tk->append_attribute("foreign").set_value("true");
-    if (caseinfo.symbols) tk->append_attribute("symbols").set_value("true");
-    if (caseinfo.capitalised) tk->append_attribute("caps").set_value("true");
     n++;
   }
   return true;

diff --git a/src/idlaktxp/mod-tokenise.h b/src/idlaktxp/mod-tokenise.h
@@ -26,6 +26,7 @@
 #include <string>
 #include "idlaktxp/txpmodule.h"
 #include "idlaktxp/txpnrules.h"
+#include "idlaktxp/txpabbrev.h"
 
 namespace kaldi {
 
@@ -46,6 +47,10 @@ class TxpTokenise : public TxpModule {
   /// Currently this data will be loaded multiple times across
   /// multiple modules
   TxpNRules nrules_;
+  /// Abbreviation data. This overrides normalisation by directly
+  /// replacing matching tokens with normalised results,
+  /// e.g. Dr. -> doctor
+  TxpAbbrev abbrev_;
 };
 
 }  // namespace kaldi

diff --git a/src/idlaktxp/test_data/mod-cex-out000.xml b/src/idlaktxp/test_data/mod-cex-out000.xml
diff --git a/src/idlaktxp/test_data/mod-cex-out001.xml b/src/idlaktxp/test_data/mod-cex-out001.xml
diff --git a/src/idlaktxp/test_data/mod-cex-out002.xml b/src/idlaktxp/test_data/mod-cex-out002.xml
@@ -1,3 +1,3 @@
 <?xml version="1.0"?><parent><txpheader><cex><cexfunction name="BackwardBackwardPhone" delim="^" isinteger="0" /><cexfunction name="BackwardPhone" delim="~" isinteger="0" /><cexfunction name="Phone" delim="-" isinteger="0" /><cexfunction name="ForwardPhone" delim="+" isinteger="0" /><cexfunction name="ForwardForwardPhone" delim="=" isinteger="0" /><cexfunction name="SegmentLocationFromFront" delim=" " isinteger="1" /><cexfunction name="SegmentLocationFromBack" delim=" " isinteger="1" /><cexfunction name="BackwardSyllableStress" delim=" " isinteger="1" /><cexfunction name="SyllableStress" delim=" " isinteger="1" /><cexfunction name="ForwardSyllableStress" delim=" " isinteger="1" /><cexfunction name="BackwardWordPosTag" delim=" " isinteger="0" /><cexfunction name="WordPosTag" delim=" " isinteger="0" /><cexfunction name="ForwardWordPosTag" delim=" " isinteger="0" /><cexfunction name="BackwardSyllableNumPhones" delim=" " isinteger="1" /><cexfunction name="SyllableNumPhones" delim=" " isinteger="1" /><cexfunction name="ForwardSyllableNumPhones" delim=" " isinteger="1" /><cexfunction name="BackwardWordNumSyls" delim=" " isinteger="1" /><cexfunction name="WordNumSyls" delim=" " isinteger="1" /><cexfunction name="ForwardWordNumSyls" delim=" " isinteger="1" /><cexfunction name="PhraseNumWords" delim=" " isinteger="1" /><cexfunction name="PhraseTobiEndTone" delim=" " isinteger="0" /></cex></txpheader><utt uttid="1" no_phrases="2"><spt phraseid="1" no_wrds="2"><ws col="0">
-</ws><break type="4" time="0.011"><tk pron="pau"><syl val="pau"><phon val="pau">^0~0-pau+hh=ah 0 0 0 0 0 PAU PAU NN 0 0 2 0 0 2 2 LL</phon></syl></tk></break><tk norm="hello" lc="true" uc="true" pos="NN" posset="1" wordid="1" pron="hh ah0 l ow1" altprons="hh eh0 l ow1, hh ah0 l ow1" spron="hh+ah0|l+ow1|" nosyl="2">Hello<ws> </ws><syl val="hh+ah0" stress="0" sylid="1" nophons="2"><phon val="hh" type="onset" phonid="1">^0~pau-hh+ah=l 0 1 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LL</phon><phon val="ah" type="nucleus" phonid="2">^pau~hh-ah+l=ow 1 0 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LL</phon></syl><syl val="l+ow1" stress="1" sylid="2" nophons="2"><phon val="l" type="onset" phonid="1">^hh~ah-l+ow=dh 0 1 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LL</phon><phon val="ow" type="nucleus" phonid="2">^ah~l-ow+dh=eh 1 0 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LL</phon></syl></tk><tk norm="there" pstpunc="," lc="true" pos="EX" posset="3" wordid="2" pron="dh eh1 r" spron="dh+eh1+r|" nosyl="1">there,<ws> </ws><syl val="dh+eh1+r" stress="1" sylid="1" nophons="3"><phon val="dh" type="onset" phonid="1">^l~ow-dh+eh=r 0 2 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LL</phon><phon val="eh" type="nucleus" phonid="2">^ow~dh-eh+r=pau 1 1 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LL</phon><phon val="r" type="coda" phonid="3">^dh~eh-r+pau=pau 2 0 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LL</phon></syl></tk><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^eh~r-pau+pau=0 0 0 1 0 0 EX PAU PAU 3 0 0 1 0 0 2 LL</phon></syl></tk></break></spt><spt phraseid="2" no_wrds="3"><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^0~pau-pau+w=ah 0 0 0 0 1 PAU PAU NN 0 0 3 0 0 1 3 LH</phon></syl></tk></break><tk norm="1" symbols="true" pos="NN" posset="1" wordid="1" pron="w ah1 n" spron="w+ah1+n|" nosyl="1">1<ws> </ws><syl val="w+ah1+n" stress="1" sylid="1" nophons="3"><phon val="w" type="onset" phonid="1">^pau~pau-w+ah=n 0 2 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LH</phon><phon val="ah" type="nucleus" phonid="2">^pau~w-ah+n=t 1 1 0 1 1 PAU 
NN NN 0 3 2 0 1 1 3 LH</phon><phon val="n" type="coda" phonid="3">^w~ah-n+t=uw 2 0 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LH</phon></syl></tk><tk norm="2" symbols="true" pos="NN" posset="1" wordid="2" pron="t uw1" spron="t+uw1|" nosyl="1">2<ws> </ws><syl val="t+uw1" stress="1" sylid="1" nophons="2"><phon val="t" type="onset" phonid="1">^ah~n-t+uw=th 0 1 1 1 1 NN NN NN 3 2 3 1 1 1 3 LH</phon><phon val="uw" type="nucleus" phonid="2">^n~t-uw+th=r 1 0 1 1 1 NN NN NN 3 2 3 1 1 1 3 LH</phon></syl></tk><tk norm="3" pstpunc="." symbols="true" pos="NN" posset="1" wordid="3" pron="th r iy1" spron="th_r+iy1|" nosyl="1">3.<ws col="20">
-</ws><syl val="th_r+iy1" stress="1" sylid="1" nophons="3"><phon val="th" type="onset" phonid="1">^t~uw-th+r=iy 0 2 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LH</phon><phon val="r" type="onset" phonid="2">^uw~th-r+iy=pau 1 1 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LH</phon><phon val="iy" type="nucleus" phonid="3">^th~r-iy+pau=0 2 0 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LH</phon></syl></tk><break type="4" time="0.4"><tk pron="pau"><syl val="pau"><phon val="pau">^r~iy-pau+0=0 0 0 1 0 0 NN PAU PAU 3 0 0 1 0 0 3 LH</phon></syl></tk></break></spt></utt></parent>
+</ws><break type="4" time="0.011"><tk pron="pau"><syl val="pau"><phon val="pau">^0~0-pau+hh=ah 0 0 0 0 0 PAU PAU NN 0 0 2 0 0 2 2 LH</phon></syl></tk></break><tk norm="hello" lc="true" uc="true" pos="NN" posset="1" wordid="1" pron="hh ah0 l ow1" altprons="hh eh0 l ow1, hh ah0 l ow1" spron="hh+ah0|l+ow1|" nosyl="2">Hello<ws> </ws><syl val="hh+ah0" stress="0" sylid="1" nophons="2"><phon val="hh" type="onset" phonid="1">^0~pau-hh+ah=l 0 1 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LH</phon><phon val="ah" type="nucleus" phonid="2">^pau~hh-ah+l=ow 1 0 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LH</phon></syl><syl val="l+ow1" stress="1" sylid="2" nophons="2"><phon val="l" type="onset" phonid="1">^hh~ah-l+ow=dh 0 1 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LH</phon><phon val="ow" type="nucleus" phonid="2">^ah~l-ow+dh=eh 1 0 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LH</phon></syl></tk><tk norm="there" pstpunc="," lc="true" pos="EX" posset="3" wordid="2" pron="dh eh1 r" spron="dh+eh1+r|" nosyl="1">there,<ws> </ws><syl val="dh+eh1+r" stress="1" sylid="1" nophons="3"><phon val="dh" type="onset" phonid="1">^l~ow-dh+eh=r 0 2 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LH</phon><phon val="eh" type="nucleus" phonid="2">^ow~dh-eh+r=pau 1 1 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LH</phon><phon val="r" type="coda" phonid="3">^dh~eh-r+pau=pau 2 0 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LH</phon></syl></tk><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^eh~r-pau+pau=0 0 0 1 0 0 EX PAU PAU 3 0 0 1 0 0 2 LH</phon></syl></tk></break></spt><spt phraseid="2" no_wrds="3"><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^0~pau-pau+w=ah 0 0 0 0 1 PAU PAU NN 0 0 3 0 0 1 3 LL</phon></syl></tk></break><tk norm="1" symbols="true" pos="NN" posset="1" wordid="1" pron="w ah1 n" spron="w+ah1+n|" nosyl="1">1<ws> </ws><syl val="w+ah1+n" stress="1" sylid="1" nophons="3"><phon val="w" type="onset" phonid="1">^pau~pau-w+ah=n 0 2 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LL</phon><phon val="ah" type="nucleus" phonid="2">^pau~w-ah+n=t 1 1 0 1 1 PAU 
NN NN 0 3 2 0 1 1 3 LL</phon><phon val="n" type="coda" phonid="3">^w~ah-n+t=uw 2 0 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LL</phon></syl></tk><tk norm="2" symbols="true" pos="NN" posset="1" wordid="2" pron="t uw1" spron="t+uw1|" nosyl="1">2<ws> </ws><syl val="t+uw1" stress="1" sylid="1" nophons="2"><phon val="t" type="onset" phonid="1">^ah~n-t+uw=th 0 1 1 1 1 NN NN NN 3 2 3 1 1 1 3 LL</phon><phon val="uw" type="nucleus" phonid="2">^n~t-uw+th=r 1 0 1 1 1 NN NN NN 3 2 3 1 1 1 3 LL</phon></syl></tk><tk norm="3" pstpunc="." symbols="true" pos="NN" posset="1" wordid="3" pron="th r iy1" spron="th_r+iy1|" nosyl="1">3.<ws col="20">
+</ws><syl val="th_r+iy1" stress="1" sylid="1" nophons="3"><phon val="th" type="onset" phonid="1">^t~uw-th+r=iy 0 2 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LL</phon><phon val="r" type="onset" phonid="2">^uw~th-r+iy=pau 1 1 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LL</phon><phon val="iy" type="nucleus" phonid="3">^th~r-iy+pau=0 2 0 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LL</phon></syl></tk><break type="4" time="0.4"><tk pron="pau"><syl val="pau"><phon val="pau">^r~iy-pau+0=0 0 0 1 0 0 NN PAU PAU 3 0 0 1 0 0 3 LL</phon></syl></tk></break></spt></utt></parent>