diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java index 062e6c205a..ee0aa1d6a6 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java @@ -17,6 +17,21 @@ * stuck to each of the words. We can separate that out by using two * regex, one which matches the " in a group, one which matches the * rest of the word without the " + *
+ * Aside from the text and the dependency, the new node is rather bare bones. + * Adding the -name argument allows for specifying a comma-separated list + * of names which can be used to insert the new nodes into the SemgrexMatcher + * as named nodes. This will allow for further edits in the same edit step. + * Each entry has the form index=name, such as 0=foo,1=bar, where the index + * is the 0-indexed position of the split piece. + *
+ * For example, this will split "foobar" and put the pos ADJ on the first word + *
+ * semgrex:
+ *   {word:/foobar/}=split
+ * ssurgeon:
+ *   splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf
+ *   editNode -node asdf -pos ADJ
+ * 
* * @author John Bauer */ @@ -27,8 +42,9 @@ public class SplitWord extends SsurgeonEdit { final List nodeRegex; final int headIndex; final GrammaticalRelation relation; + final Map nodeNames; - public SplitWord(String node, List nodeRegex, Integer headIndex, GrammaticalRelation relation) { + public SplitWord(String node, List nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) { if (node == null) { throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split"); } @@ -54,6 +70,24 @@ public SplitWord(String node, List nodeRegex, Integer headIndex, Grammat throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words"); } this.relation = relation; + + if (nodeNames != null) { + String[] namePieces = nodeNames.split(","); + this.nodeNames = new HashMap<>(); + for (String namePiece : namePieces) { + String[] pieces = namePiece.split("=", 2); + if (pieces.length < 2) { + throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names. 
Should look like 0=foo,1=bar"); + } + int idx = Integer.valueOf(pieces[0]); + if (idx >= this.nodeRegex.size()) { + throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)"); + } + this.nodeNames.put(idx, pieces[1]); + } + } else { + this.nodeNames = Collections.emptyMap(); + } } @Override @@ -114,8 +148,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) { matchedNode.setValue(words.get(headIndex)); for (int i = 0; i < nodeRegex.size(); ++i) { - if (i == headIndex) + if (i == headIndex) { + if (nodeNames.containsKey(i)) { + sm.putNode(nodeNames.get(i), matchedNode); + } continue; + } // otherwise, add a word with the appropriate index, // then connect it to matchedNode @@ -129,7 +167,12 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) { sg.addVertex(newNode); sg.addEdge(matchedNode, newNode, relation, 0.0, false); + + if (nodeNames.containsKey(i)) { + sm.putNode(nodeNames.get(i), newNode); + } } + return true; } } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java index a683b0b8f9..ae064b4e94 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java @@ -625,7 +625,7 @@ public static SsurgeonEdit parseEditLine(String editLine, Map at return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word")); } else if (command.equalsIgnoreCase(SplitWord.LABEL)) { GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln); - return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln); + return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name); } throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported"); } catch (SsurgeonParseException e) { diff 
--git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java index a09a6e5cb8..da1cb5c378 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java @@ -2006,6 +2006,44 @@ public void readXMLSplitTwoWordsAfter() { assertEquals(newSg, expected); } + /** + * Test splitWord, which should split a word into pieces based on regex matches, with the head at position 1 + */ + @Test + public void readXMLSplitTwoWordsNamed() { + String doc = String.join(newline, + "", + " ", + " 38", + " Test splitting a word into two pieces with the head at the start", + " UniversalEnglish", + " " + XMLUtils.escapeXML("{word:/foobar/}=split") + "", + " splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1 -name 0=asdf", + " editNode -node asdf -pos ADJ", + " ", + ""); + Ssurgeon inst = Ssurgeon.inst(); + List patterns = inst.readFromString(doc); + assertEquals(patterns.size(), 1); + SsurgeonPattern pattern = patterns.get(0); + + SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]"); + SemanticGraph newSg = pattern.iterate(sg).first; + SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]"); + assertEquals(newSg, expected); + + boolean found = false; + for (IndexedWord word : newSg.vertexSet()) { + if (word.index() == 2) { + assertEquals("ADJ", word.get(CoreAnnotations.PartOfSpeechAnnotation.class)); + found = true; + } else { + assertEquals(null, word.get(CoreAnnotations.PartOfSpeechAnnotation.class)); + } + } + assertTrue(found); + } + /** * Test splitWord, which should split a word into pieces based on regex matches, with three pieces */