Skip to content

Commit

Permalink
Add the ability to mark newly created nodes with names in the Semgrex…
Browse files Browse the repository at this point in the history
…Matcher, allowing for a compound operation which then assigns more fields to that node
  • Loading branch information
AngledLuffa committed Jul 2, 2024
1 parent 13ede5a commit 0e39b37
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 3 deletions.
47 changes: 45 additions & 2 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,21 @@
* stuck to each of the words. We can separate that out by using two
* regex, one which matches the " in a group, one which matches the
* rest of the word without the "
* <br>
* Aside from the text and the dependency, the new node is rather bare bones.
* Adding the -name argument allows for specifying a comma-separated list
* of index=name pairs (for example, 0=foo,1=bar) which are used to insert
* the split pieces into the SemgrexMatcher as named nodes. This allows
* for further edits to those nodes in the same edit step.
* The indices are 0-based positions in the list of -regex arguments.
* <br>
* For example, this will split "foobar" and put the pos ADJ on the first word
* <pre>
* semgrex:
* {word:/foobar/}=split
* ssurgeon:
* splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf
* editNode -node asdf -pos ADJ
* </pre>
*
* @author John Bauer
*/
Expand All @@ -27,8 +42,9 @@ public class SplitWord extends SsurgeonEdit {
final List<Pattern> nodeRegex;
final int headIndex;
final GrammaticalRelation relation;
final Map<Integer, String> nodeNames;

public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation) {
public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) {
if (node == null) {
throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
}
Expand All @@ -54,6 +70,24 @@ public SplitWord(String node, List<String> nodeRegex, Integer headIndex, Grammat
throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
}
this.relation = relation;

if (nodeNames != null) {
String[] namePieces = nodeNames.split(",");
this.nodeNames = new HashMap<>();
for (String namePiece : namePieces) {
String[] pieces = namePiece.split("=", 2);
if (pieces.length < 2) {
throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names. Should look like 0=foo,1=bar");
}
int idx = Integer.valueOf(pieces[0]);
if (idx >= this.nodeRegex.size()) {
throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)");
}
this.nodeNames.put(idx, pieces[1]);
}
} else {
this.nodeNames = Collections.emptyMap();
}
}

@Override
Expand Down Expand Up @@ -114,8 +148,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
matchedNode.setValue(words.get(headIndex));

for (int i = 0; i < nodeRegex.size(); ++i) {
if (i == headIndex)
if (i == headIndex) {
if (nodeNames.containsKey(i)) {
sm.putNode(nodeNames.get(i), matchedNode);
}
continue;
}

// otherwise, add a word with the appropriate index,
// then connect it to matchedNode
Expand All @@ -129,7 +167,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {

sg.addVertex(newNode);
sg.addEdge(matchedNode, newNode, relation, 0.0, false);

if (nodeNames.containsKey(i)) {
sm.putNode(nodeNames.get(i), newNode);
}
}

return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
}
throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
} catch (SsurgeonParseException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2006,6 +2006,44 @@ public void readXMLSplitTwoWordsAfter() {
assertEquals(newSg, expected);
}

/**
 * Test splitWord with the -name argument: the word is split into two pieces
 * based on regex matches with the head at position 1, the piece at position 0
 * is registered under the name "asdf", and a following editNode edit uses
 * that name to set the POS tag on the newly created node.
 */
@Test
public void readXMLSplitTwoWordsNamed() {
  // NOTE(review): the <notes> text below says the head is "at the start", but
  // -headIndex 1 selects the second piece as the head -- confirm the wording.
  String doc = String.join(newline,
                           "<ssurgeon-pattern-list>",
                           "  <ssurgeon-pattern>",
                           "    <uid>38</uid>",
                           "    <notes>Test splitting a word into two pieces with the head at the start</notes>",
                           "    <language>UniversalEnglish</language>",
                           "    <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
                           "    <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf</edit-list>",
                           "    <edit-list>editNode -node asdf -pos ADJ</edit-list>",
                           "  </ssurgeon-pattern>",
                           "</ssurgeon-pattern-list>");
  Ssurgeon inst = Ssurgeon.inst();
  List<SsurgeonPattern> patterns = inst.readFromString(doc);
  // expected value goes first so that a failure message labels the values correctly
  assertEquals(1, patterns.size());
  SsurgeonPattern pattern = patterns.get(0);

  SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
  SemanticGraph newSg = pattern.iterate(sg).first;
  SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
  assertEquals(expected, newSg);

  // Only the node named via "-name 0=asdf" (foo, at index 2 after the split)
  // should have received a POS tag; every other node must remain untagged.
  boolean found = false;
  for (IndexedWord word : newSg.vertexSet()) {
    if (word.index() == 2) {
      assertEquals("ADJ", word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      found = true;
    } else {
      assertEquals(null, word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
    }
  }
  // guard against the loop passing vacuously if no node has index 2
  assertTrue(found);
}

/**
* Test splitWord, which should split a word into pieces based on regex matches, with three pieces
*/
Expand Down

0 comments on commit 0e39b37

Please sign in to comment.