diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
index 062e6c205a..ee0aa1d6a6 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
@@ -17,6 +17,21 @@
* stuck to each of the words. We can separate that out by using two
* regex, one which matches the " in a group, one which matches the
* rest of the word without the "
+ *
+ * Aside from the text and the dependency, the new node is rather bare bones.
+ * Adding the -name argument allows for specifying a comma-separated list
+ * of index=name pairs which are used to insert the new nodes into the SemgrexMatcher
+ * as named nodes. This will allow for further edits in the same edit step.
+ * The indices are 0-based and refer to the pieces in the order of the -regex arguments.
+ *
+ * For example, this will split "foobar" and put the pos ADJ on the first word
+ *
+ * semgrex:
+ * {word:/foobar/}=split
+ * ssurgeon:
+ * splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf
+ * editNode -node asdf -pos ADJ
+ *
*
* @author John Bauer
*/
@@ -27,8 +42,9 @@ public class SplitWord extends SsurgeonEdit {
final List nodeRegex;
final int headIndex;
final GrammaticalRelation relation;
+ final Map nodeNames;
- public SplitWord(String node, List nodeRegex, Integer headIndex, GrammaticalRelation relation) {
+ public SplitWord(String node, List nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) {
if (node == null) {
throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
}
@@ -54,6 +70,24 @@ public SplitWord(String node, List nodeRegex, Integer headIndex, Grammat
throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
}
this.relation = relation;
+
+ if (nodeNames != null) {
+ String[] namePieces = nodeNames.split(",");
+ this.nodeNames = new HashMap<>();
+ for (String namePiece : namePieces) {
+ String[] pieces = namePiece.split("=", 2);
+ if (pieces.length < 2) {
+ throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names. Should look like 0=foo,1=bar");
+ }
+ int idx = Integer.valueOf(pieces[0]);
+ if (idx >= this.nodeRegex.size()) {
+ throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)");
+ }
+ this.nodeNames.put(idx, pieces[1]);
+ }
+ } else {
+ this.nodeNames = Collections.emptyMap();
+ }
}
@Override
@@ -114,8 +148,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
matchedNode.setValue(words.get(headIndex));
for (int i = 0; i < nodeRegex.size(); ++i) {
- if (i == headIndex)
+ if (i == headIndex) {
+ if (nodeNames.containsKey(i)) {
+ sm.putNode(nodeNames.get(i), matchedNode);
+ }
continue;
+ }
// otherwise, add a word with the appropriate index,
// then connect it to matchedNode
@@ -129,7 +167,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
sg.addVertex(newNode);
sg.addEdge(matchedNode, newNode, relation, 0.0, false);
+
+ if (nodeNames.containsKey(i)) {
+ sm.putNode(nodeNames.get(i), newNode);
+ }
}
+
return true;
}
}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
index a683b0b8f9..ae064b4e94 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -625,7 +625,7 @@ public static SsurgeonEdit parseEditLine(String editLine, Map at
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
- return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
+ return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
}
throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
} catch (SsurgeonParseException e) {
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
index a09a6e5cb8..da1cb5c378 100644
--- a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
+++ b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -2006,6 +2006,44 @@ public void readXMLSplitTwoWordsAfter() {
assertEquals(newSg, expected);
}
+ /**
+ * Test splitWord with -name: the non-head piece is given a name, and a following editNode in the same pattern uses that name to set its POS tag
+ */
+ @Test
+ public void readXMLSplitTwoWordsNamed() {
+ String doc = String.join(newline,
+ "",
+ " ",
+ " 38",
+ " Test splitting a word into two pieces with the head at the start",
+ " UniversalEnglish",
+ " " + XMLUtils.escapeXML("{word:/foobar/}=split") + "",
+ " splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf",
+ " editNode -node asdf -pos ADJ",
+ " ",
+ "");
+ Ssurgeon inst = Ssurgeon.inst();
+ List patterns = inst.readFromString(doc);
+ assertEquals(patterns.size(), 1);
+ SsurgeonPattern pattern = patterns.get(0);
+
+ SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
+ SemanticGraph newSg = pattern.iterate(sg).first;
+ SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
+ assertEquals(newSg, expected);
+
+ boolean found = false;
+ for (IndexedWord word : newSg.vertexSet()) {
+ if (word.index() == 2) {
+ assertEquals("ADJ", word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
+ found = true;
+ } else {
+ assertEquals(null, word.get(CoreAnnotations.PartOfSpeechAnnotation.class));
+ }
+ }
+ assertTrue(found);
+ }
+
/**
* Test splitWord, which should split a word into pieces based on regex matches, with three pieces
*/