Skip to content

Commit

Permalink
Added abbreviation handling to Idlak
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewaylett committed Jun 6, 2018
1 parent dd152b9 commit 0dcf922
Show file tree
Hide file tree
Showing 45 changed files with 2,331 additions and 2,140 deletions.
2 changes: 1 addition & 1 deletion idlak-data/en/nrules-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
to deal with hyphenation.
</comment>
<exp>
<![CDATA[^([ \n\t\r\-]+)]]>
<![CDATA[^([ \n\t\r]+)]]>
</exp>
</regex>

Expand Down
2 changes: 1 addition & 1 deletion src/idlaktxp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ TESTFILES = idlak-mod-test

OBJFILES = txpxmldata.o txputf8.o txppcre.o txpnrules.o txppos.o \
txppbreak.o txpsylmax.o txplexicon.o txplts.o txpmodule.o txpcexspec.o \
txpparse-options.o \
txpparse-options.o txpabbrev.o \
cexfunctions.o cexfunctionscatalog.o mod-tokenise.o \
mod-postag.o mod-pauses.o mod-phrasing.o mod-pronounce.o mod-syllabify.o mod-cex.o

Expand Down
123 changes: 103 additions & 20 deletions src/idlaktxp/mod-tokenise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,17 @@ bool TxpTokenise::Init(const TxpParseOptions &opts) {
opts_ = &opts;
tpdb_ = opts.GetTpdb();
nrules_.Init(opts, std::string(opts_->GetValue(GetName().c_str(), "arch")));
return nrules_.Parse(tpdb_);
abbrev_.Init(opts, std::string(GetOptValue("arch")));
return nrules_.Parse(tpdb_) && abbrev_.Parse(tpdb_);
}

bool TxpTokenise::Process(pugi::xml_document* input) {
const char* p;
int32 n = 0;
int32 offset, col = 0;
std::string token, wspace, tmp;
pugi::xml_node tkroot, tk, ws, ntxt;
pugi::xml_node tkroot, tk, tkcopy, ws, ntxt, lex;
TxpAbbrevInfo * abbrev_info;
// all text nodes are tokenised
pugi::xpath_node_set text =
input->document_element().select_nodes("//text()");
Expand All @@ -55,7 +57,41 @@ bool TxpTokenise::Process(pugi::xml_document* input) {
ntxt.set_value(token.c_str());
n += 1;
nrules_.ReplaceUtf8Punc(token, &tmp);
SetPuncCaseInfo(&tmp, &tk);
/// check for full token matches without partial punctuation
/// i.e. :-) but not (US)
abbrev_info = abbrev_.LookupAbbrev(token.c_str());
if (abbrev_info) {
for(int32 i = 0; i < abbrev_info->expansions.size(); i++) {
if (!i) {
tk.append_attribute("norm");
tk.attribute("norm").set_value(abbrev_info->expansions[0].c_str());
if (!abbrev_info->lexentries[0].empty()) {
lex = tk.parent().insert_child_after("lex", tk);
lex.append_attribute("type").set_value(abbrev_info->lexentries[0].c_str());
tkcopy = lex.append_copy(tk);
tk.parent().remove_child(tk);
tk = tkcopy;
}
}
else {
if (!abbrev_info->lexentries[i].empty()) {
lex = tk.parent().insert_child_after("lex", tk);
lex.append_attribute("type").set_value(abbrev_info->lexentries[i].c_str());
tk = lex.append_child("tk");
tk.append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
tk = lex; //so we add new tokens after the lex tag not inside it
}
else {
tk = tk.parent().insert_child_after("tk", tk);
tk.append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
}
}
}
}
// if no full match of an abbreviation unpack punctuation
else {
SetPuncCaseInfo(&tmp, &tk);
}
}
if (wspace.length()) {
if (token.length())
Expand Down Expand Up @@ -88,35 +124,82 @@ bool TxpTokenise::Process(pugi::xml_document* input) {
int32 TxpTokenise::SetPuncCaseInfo(std::string* tkin, pugi::xml_node* tk) {
const char* p;
TxpCaseInfo caseinfo;
pugi::xml_node node;
pugi::xml_node node, lex, tkcopy;
int32 n = 0;
std::string token;
std::string prepunc;
std::string pstpunc;
TxpAbbrevInfo * abbrev_info;
p = tkin->c_str();
while (*p) {
p = nrules_.ConsumePunc(p, &prepunc, &token, &pstpunc);
if (n) {
*tk = tk->parent().insert_child_after("tk", *tk);
}
if (prepunc.length()) {
tk->append_attribute("prepunc");
tk->attribute("prepunc").set_value(prepunc.c_str());
}
if (token.length()) {
tk->append_attribute("norm");
nrules_.NormCaseCharacter(&token, caseinfo);
tk->attribute("norm").set_value(token.c_str());
// check to see if there is an abbreviation that matches the token and
// completely or partially matches the punctuation
abbrev_info = abbrev_.LookupAbbrev(token.c_str(), prepunc.c_str(), pstpunc.c_str());
if (abbrev_info) {
// trim punctuation as appropriate
prepunc = prepunc.substr(0, prepunc.size() - abbrev_.CheckPrePunc(prepunc.c_str(), abbrev_info));
pstpunc = pstpunc.substr(abbrev_.CheckPstPunc(pstpunc.c_str(), abbrev_info),
pstpunc.size() - abbrev_.CheckPstPunc(pstpunc.c_str(),
abbrev_info));
for(int32 i = 0; i < abbrev_info->expansions.size(); i++) {
if (!i) {
tk->append_attribute("norm");
tk->attribute("norm").set_value(abbrev_info->expansions[0].c_str());
if (!abbrev_info->lexentries[0].empty()) {
lex = tk->parent().insert_child_after("lex", *tk);
lex.append_attribute("type").set_value(abbrev_info->lexentries[0].c_str());
tkcopy = lex.append_copy(*tk);
tk->parent().remove_child(*tk);
*tk = tkcopy;
}
if (prepunc.length()) {
tk->append_attribute("prepunc");
tk->attribute("prepunc").set_value(prepunc.c_str());
}
}
else {
if (!abbrev_info->lexentries[i].empty()) {
lex = tk->parent().insert_child_after("lex", *tk);
lex.append_attribute("type").set_value(abbrev_info->lexentries[i].c_str());
*tk = lex.append_child("tk");
tk->append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
*tk = lex;
}
else {
*tk = tk->parent().insert_child_after("tk", *tk);
tk->append_attribute("norm").set_value(abbrev_info->expansions[i].c_str());
}
}
}
if (pstpunc.length()) {
tk->append_attribute("pstpunc");
tk->attribute("pstpunc").set_value(pstpunc.c_str());
}
}
if (pstpunc.length()) {
tk->append_attribute("pstpunc");
tk->attribute("pstpunc").set_value(pstpunc.c_str());
else {
if (prepunc.length()) {
tk->append_attribute("prepunc");
tk->attribute("prepunc").set_value(prepunc.c_str());
}
if (token.length()) {
tk->append_attribute("norm");
nrules_.NormCaseCharacter(&token, caseinfo);
tk->attribute("norm").set_value(token.c_str());
}
if (pstpunc.length()) {
tk->append_attribute("pstpunc");
tk->attribute("pstpunc").set_value(pstpunc.c_str());
}
if (caseinfo.lowercase) tk->append_attribute("lc").set_value("true");
if (caseinfo.uppercase) tk->append_attribute("uc").set_value("true");
if (caseinfo.foreign) tk->append_attribute("foreign").set_value("true");
if (caseinfo.symbols) tk->append_attribute("symbols").set_value("true");
if (caseinfo.capitalised) tk->append_attribute("caps").set_value("true");
}
if (caseinfo.lowercase) tk->append_attribute("lc").set_value("true");
if (caseinfo.uppercase) tk->append_attribute("uc").set_value("true");
if (caseinfo.foreign) tk->append_attribute("foreign").set_value("true");
if (caseinfo.symbols) tk->append_attribute("symbols").set_value("true");
if (caseinfo.capitalised) tk->append_attribute("caps").set_value("true");
n++;
}
return true;
Expand Down
5 changes: 5 additions & 0 deletions src/idlaktxp/mod-tokenise.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <string>
#include "idlaktxp/txpmodule.h"
#include "idlaktxp/txpnrules.h"
#include "idlaktxp/txpabbrev.h"

namespace kaldi {

Expand All @@ -46,6 +47,10 @@ class TxpTokenise : public TxpModule {
/// Currently this data will be loaded multiple times across
/// multiple modules
TxpNRules nrules_;
/// Abbreviation data. This overrides normalisation by directly
/// replacing matching tokens with normalised results
/// i.e. Dr. -> doctor
TxpAbbrev abbrev_;
};

} // namespace kaldi
Expand Down
6 changes: 3 additions & 3 deletions src/idlaktxp/test_data/mod-cex-out000.xml

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions src/idlaktxp/test_data/mod-cex-out001.xml

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/idlaktxp/test_data/mod-cex-out002.xml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
<?xml version="1.0"?><parent><txpheader><cex><cexfunction name="BackwardBackwardPhone" delim="^" isinteger="0" /><cexfunction name="BackwardPhone" delim="~" isinteger="0" /><cexfunction name="Phone" delim="-" isinteger="0" /><cexfunction name="ForwardPhone" delim="+" isinteger="0" /><cexfunction name="ForwardForwardPhone" delim="=" isinteger="0" /><cexfunction name="SegmentLocationFromFront" delim=" " isinteger="1" /><cexfunction name="SegmentLocationFromBack" delim=" " isinteger="1" /><cexfunction name="BackwardSyllableStress" delim=" " isinteger="1" /><cexfunction name="SyllableStress" delim=" " isinteger="1" /><cexfunction name="ForwardSyllableStress" delim=" " isinteger="1" /><cexfunction name="BackwardWordPosTag" delim=" " isinteger="0" /><cexfunction name="WordPosTag" delim=" " isinteger="0" /><cexfunction name="ForwardWordPosTag" delim=" " isinteger="0" /><cexfunction name="BackwardSyllableNumPhones" delim=" " isinteger="1" /><cexfunction name="SyllableNumPhones" delim=" " isinteger="1" /><cexfunction name="ForwardSyllableNumPhones" delim=" " isinteger="1" /><cexfunction name="BackwardWordNumSyls" delim=" " isinteger="1" /><cexfunction name="WordNumSyls" delim=" " isinteger="1" /><cexfunction name="ForwardWordNumSyls" delim=" " isinteger="1" /><cexfunction name="PhraseNumWords" delim=" " isinteger="1" /><cexfunction name="PhraseTobiEndTone" delim=" " isinteger="0" /></cex></txpheader><utt uttid="1" no_phrases="2"><spt phraseid="1" no_wrds="2"><ws col="0">
</ws><break type="4" time="0.011"><tk pron="pau"><syl val="pau"><phon val="pau">^0~0-pau+hh=ah 0 0 0 0 0 PAU PAU NN 0 0 2 0 0 2 2 LL</phon></syl></tk></break><tk norm="hello" lc="true" uc="true" pos="NN" posset="1" wordid="1" pron="hh ah0 l ow1" altprons="hh eh0 l ow1, hh ah0 l ow1" spron="hh+ah0|l+ow1|" nosyl="2">Hello<ws> </ws><syl val="hh+ah0" stress="0" sylid="1" nophons="2"><phon val="hh" type="onset" phonid="1">^0~pau-hh+ah=l 0 1 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LL</phon><phon val="ah" type="nucleus" phonid="2">^pau~hh-ah+l=ow 1 0 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LL</phon></syl><syl val="l+ow1" stress="1" sylid="2" nophons="2"><phon val="l" type="onset" phonid="1">^hh~ah-l+ow=dh 0 1 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LL</phon><phon val="ow" type="nucleus" phonid="2">^ah~l-ow+dh=eh 1 0 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LL</phon></syl></tk><tk norm="there" pstpunc="," lc="true" pos="EX" posset="3" wordid="2" pron="dh eh1 r" spron="dh+eh1+r|" nosyl="1">there,<ws> </ws><syl val="dh+eh1+r" stress="1" sylid="1" nophons="3"><phon val="dh" type="onset" phonid="1">^l~ow-dh+eh=r 0 2 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LL</phon><phon val="eh" type="nucleus" phonid="2">^ow~dh-eh+r=pau 1 1 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LL</phon><phon val="r" type="coda" phonid="3">^dh~eh-r+pau=pau 2 0 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LL</phon></syl></tk><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^eh~r-pau+pau=0 0 0 1 0 0 EX PAU PAU 3 0 0 1 0 0 2 LL</phon></syl></tk></break></spt><spt phraseid="2" no_wrds="3"><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^0~pau-pau+w=ah 0 0 0 0 1 PAU PAU NN 0 0 3 0 0 1 3 LH</phon></syl></tk></break><tk norm="1" symbols="true" pos="NN" posset="1" wordid="1" pron="w ah1 n" spron="w+ah1+n|" nosyl="1">1<ws> </ws><syl val="w+ah1+n" stress="1" sylid="1" nophons="3"><phon val="w" type="onset" phonid="1">^pau~pau-w+ah=n 0 2 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LH</phon><phon val="ah" type="nucleus" phonid="2">^pau~w-ah+n=t 1 1 0 1 1 PAU 
NN NN 0 3 2 0 1 1 3 LH</phon><phon val="n" type="coda" phonid="3">^w~ah-n+t=uw 2 0 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LH</phon></syl></tk><tk norm="2" symbols="true" pos="NN" posset="1" wordid="2" pron="t uw1" spron="t+uw1|" nosyl="1">2<ws> </ws><syl val="t+uw1" stress="1" sylid="1" nophons="2"><phon val="t" type="onset" phonid="1">^ah~n-t+uw=th 0 1 1 1 1 NN NN NN 3 2 3 1 1 1 3 LH</phon><phon val="uw" type="nucleus" phonid="2">^n~t-uw+th=r 1 0 1 1 1 NN NN NN 3 2 3 1 1 1 3 LH</phon></syl></tk><tk norm="3" pstpunc="." symbols="true" pos="NN" posset="1" wordid="3" pron="th r iy1" spron="th_r+iy1|" nosyl="1">3.<ws col="20">
</ws><syl val="th_r+iy1" stress="1" sylid="1" nophons="3"><phon val="th" type="onset" phonid="1">^t~uw-th+r=iy 0 2 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LH</phon><phon val="r" type="onset" phonid="2">^uw~th-r+iy=pau 1 1 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LH</phon><phon val="iy" type="nucleus" phonid="3">^th~r-iy+pau=0 2 0 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LH</phon></syl></tk><break type="4" time="0.4"><tk pron="pau"><syl val="pau"><phon val="pau">^r~iy-pau+0=0 0 0 1 0 0 NN PAU PAU 3 0 0 1 0 0 3 LH</phon></syl></tk></break></spt></utt></parent>
</ws><break type="4" time="0.011"><tk pron="pau"><syl val="pau"><phon val="pau">^0~0-pau+hh=ah 0 0 0 0 0 PAU PAU NN 0 0 2 0 0 2 2 LH</phon></syl></tk></break><tk norm="hello" lc="true" uc="true" pos="NN" posset="1" wordid="1" pron="hh ah0 l ow1" altprons="hh eh0 l ow1, hh ah0 l ow1" spron="hh+ah0|l+ow1|" nosyl="2">Hello<ws> </ws><syl val="hh+ah0" stress="0" sylid="1" nophons="2"><phon val="hh" type="onset" phonid="1">^0~pau-hh+ah=l 0 1 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LH</phon><phon val="ah" type="nucleus" phonid="2">^pau~hh-ah+l=ow 1 0 0 0 1 PAU NN EX 0 2 2 0 2 1 2 LH</phon></syl><syl val="l+ow1" stress="1" sylid="2" nophons="2"><phon val="l" type="onset" phonid="1">^hh~ah-l+ow=dh 0 1 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LH</phon><phon val="ow" type="nucleus" phonid="2">^ah~l-ow+dh=eh 1 0 0 1 1 PAU NN EX 2 2 3 0 2 1 2 LH</phon></syl></tk><tk norm="there" pstpunc="," lc="true" pos="EX" posset="3" wordid="2" pron="dh eh1 r" spron="dh+eh1+r|" nosyl="1">there,<ws> </ws><syl val="dh+eh1+r" stress="1" sylid="1" nophons="3"><phon val="dh" type="onset" phonid="1">^l~ow-dh+eh=r 0 2 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LH</phon><phon val="eh" type="nucleus" phonid="2">^ow~dh-eh+r=pau 1 1 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LH</phon><phon val="r" type="coda" phonid="3">^dh~eh-r+pau=pau 2 0 1 1 0 NN EX PAU 2 3 0 2 1 0 2 LH</phon></syl></tk><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^eh~r-pau+pau=0 0 0 1 0 0 EX PAU PAU 3 0 0 1 0 0 2 LH</phon></syl></tk></break></spt><spt phraseid="2" no_wrds="3"><break type="3" time="0.1"><tk pron="pau"><syl val="pau"><phon val="pau">^0~pau-pau+w=ah 0 0 0 0 1 PAU PAU NN 0 0 3 0 0 1 3 LL</phon></syl></tk></break><tk norm="1" symbols="true" pos="NN" posset="1" wordid="1" pron="w ah1 n" spron="w+ah1+n|" nosyl="1">1<ws> </ws><syl val="w+ah1+n" stress="1" sylid="1" nophons="3"><phon val="w" type="onset" phonid="1">^pau~pau-w+ah=n 0 2 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LL</phon><phon val="ah" type="nucleus" phonid="2">^pau~w-ah+n=t 1 1 0 1 1 PAU 
NN NN 0 3 2 0 1 1 3 LL</phon><phon val="n" type="coda" phonid="3">^w~ah-n+t=uw 2 0 0 1 1 PAU NN NN 0 3 2 0 1 1 3 LL</phon></syl></tk><tk norm="2" symbols="true" pos="NN" posset="1" wordid="2" pron="t uw1" spron="t+uw1|" nosyl="1">2<ws> </ws><syl val="t+uw1" stress="1" sylid="1" nophons="2"><phon val="t" type="onset" phonid="1">^ah~n-t+uw=th 0 1 1 1 1 NN NN NN 3 2 3 1 1 1 3 LL</phon><phon val="uw" type="nucleus" phonid="2">^n~t-uw+th=r 1 0 1 1 1 NN NN NN 3 2 3 1 1 1 3 LL</phon></syl></tk><tk norm="3" pstpunc="." symbols="true" pos="NN" posset="1" wordid="3" pron="th r iy1" spron="th_r+iy1|" nosyl="1">3.<ws col="20">
</ws><syl val="th_r+iy1" stress="1" sylid="1" nophons="3"><phon val="th" type="onset" phonid="1">^t~uw-th+r=iy 0 2 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LL</phon><phon val="r" type="onset" phonid="2">^uw~th-r+iy=pau 1 1 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LL</phon><phon val="iy" type="nucleus" phonid="3">^th~r-iy+pau=0 2 0 1 1 0 NN NN PAU 2 3 0 1 1 0 3 LL</phon></syl></tk><break type="4" time="0.4"><tk pron="pau"><syl val="pau"><phon val="pau">^r~iy-pau+0=0 0 0 1 0 0 NN PAU PAU 3 0 0 1 0 0 3 LL</phon></syl></tk></break></spt></utt></parent>
Loading

0 comments on commit 0dcf922

Please sign in to comment.