diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java
new file mode 100644
index 000000000000..0e297c483dd9
--- /dev/null
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java
@@ -0,0 +1,161 @@
+package com.ibm.icu.text.segmenter;
+
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.util.ULocale;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+/**
+ * A {@link Segmenter} backed by the locale-aware {@link BreakIterator} factory methods, chosen
+ * according to a {@link SegmentationType}.
+ */
+public class LocalizedSegmenter implements Segmenter {
+
+  private final ULocale locale;
+
+  private final SegmentationType segmentationType;
+
+  @Override
+  public Segments segment(CharSequence s) {
+    return new LocalizedSegments(s, this);
+  }
+
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) {
+    this.locale = locale;
+    this.segmentationType = segmentationType;
+  }
+
+  /**
+   * Creates a new {@link BreakIterator} for the configured locale and segmentation type.
+   *
+   * @internal
+   * @deprecated This API is ICU internal only.
+   */
+  @Override
+  @Deprecated
+  public BreakIterator getNewBreakIterator() {
+    BreakIterator breakIter;
+    switch (this.segmentationType) {
+      case LINE:
+        breakIter = BreakIterator.getLineInstance(this.locale);
+        break;
+      case SENTENCE:
+        breakIter = BreakIterator.getSentenceInstance(this.locale);
+        break;
+      case WORD:
+        breakIter = BreakIterator.getWordInstance(this.locale);
+        break;
+      case GRAPHEME_CLUSTER:
+      default:
+        breakIter = BreakIterator.getCharacterInstance(this.locale);
+        break;
+    }
+    return breakIter;
+  }
+
+  public enum SegmentationType {
+    GRAPHEME_CLUSTER,
+    WORD,
+    LINE,
+    SENTENCE,
+  }
+
+  public static class Builder {
+
+    private ULocale locale = ULocale.ROOT;
+
+    private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER;
+
+    Builder() { }
+
+    public Builder setLocale(ULocale locale) {
+      this.locale = locale;
+      return this;
+    }
+
+    public Builder setSegmentationType(SegmentationType segmentationType) {
+      this.segmentationType = segmentationType;
+      return this;
+    }
+
+    public LocalizedSegmenter build() {
+      return new LocalizedSegmenter(this.locale, this.segmentationType);
+    }
+
+  }
+
+  /**
+   * {@link Segments} over a single source sequence. Declared {@code static} because the segmenter
+   * is already passed in explicitly, so no implicit enclosing-instance reference is needed.
+   */
+  public static class LocalizedSegments implements Segments {
+
+    private final CharSequence source;
+
+    private final LocalizedSegmenter segmenter;
+
+    private final BreakIterator breakIter;
+
+    private LocalizedSegments(CharSequence source, LocalizedSegmenter segmenter) {
+      this.source = source;
+      this.segmenter = segmenter;
+      this.breakIter = this.segmenter.getNewBreakIterator();
+    }
+
+    @Override
+    public Stream<CharSequence> subSequences() {
+      return SegmentsImplUtils.subSequences(this.breakIter, this.source);
+    }
+
+    @Override
+    public Segment segmentAt(int i) {
+      return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Stream<Segment> segments() {
+      return SegmentsImplUtils.segments(this.breakIter, this.source);
+    }
+
+    @Override
+    public boolean isBoundary(int i) {
+      return SegmentsImplUtils.isBoundary(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Stream<Segment> segmentsFrom(int i) {
+      return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Stream<Segment> segmentsBefore(int i) {
+      return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Function<Segment, CharSequence> segmentToSequenceFn() {
+      return SegmentsImplUtils.segmentToSequenceFn(this.source);
+    }
+
+    @Override
+    public IntStream boundaries() {
+      return SegmentsImplUtils.boundaries(this.breakIter, this.source);
+    }
+
+    @Override
+    public IntStream boundariesAfter(int i) {
+      return SegmentsImplUtils.boundariesAfter(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public IntStream boundariesBackFrom(int i) {
+      return SegmentsImplUtils.boundariesBackFrom(this.breakIter, this.source, i);
+    }
+  }
+
+}
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java
new file mode 100644
index 000000000000..18f32ae78fd9
--- /dev/null
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java
@@ -0,0 +1,118 @@
+package com.ibm.icu.text.segmenter;
+
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+/** A {@link Segmenter} whose boundaries are defined by {@link RuleBasedBreakIterator} rules. */
+public class RuleBasedSegmenter implements Segmenter {
+
+  private final String rules;
+
+  @Override
+  public Segments segment(CharSequence s) {
+    return new RuleBasedSegments(s, this);
+  }
+
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  RuleBasedSegmenter(String rules) {
+    this.rules = rules;
+  }
+
+  /**
+   * Creates a new {@link RuleBasedBreakIterator} compiled from this segmenter's rules.
+   *
+   * @internal
+   * @deprecated This API is ICU internal only.
+   */
+  @Override
+  @Deprecated
+  public RuleBasedBreakIterator getNewBreakIterator() {
+    return new RuleBasedBreakIterator(this.rules);
+  }
+
+  public static class Builder {
+
+    private String rules;
+
+    Builder() { }
+
+    public Builder setRules(String rules) {
+      this.rules = rules;
+      return this;
+    }
+
+    public RuleBasedSegmenter build() {
+      return new RuleBasedSegmenter(this.rules);
+    }
+  }
+
+  public static class RuleBasedSegments implements Segments {
+    private final CharSequence source;
+
+    private final RuleBasedSegmenter segmenter;
+
+    private final BreakIterator breakIter;
+
+    RuleBasedSegments(CharSequence source, RuleBasedSegmenter segmenter) {
+      this.source = source;
+      this.segmenter = segmenter;
+      this.breakIter = this.segmenter.getNewBreakIterator();
+    }
+
+    @Override
+    public Stream<CharSequence> subSequences() {
+      return SegmentsImplUtils.subSequences(this.breakIter, this.source);
+    }
+
+    @Override
+    public Segment segmentAt(int i) {
+      return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Stream<Segment> segments() {
+      return SegmentsImplUtils.segments(this.breakIter, this.source);
+    }
+
+    @Override
+    public boolean isBoundary(int i) {
+      return SegmentsImplUtils.isBoundary(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Stream<Segment> segmentsFrom(int i) {
+      return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Stream<Segment> segmentsBefore(int i) {
+      return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public Function<Segment, CharSequence> segmentToSequenceFn() {
+      return SegmentsImplUtils.segmentToSequenceFn(this.source);
+    }
+
+    @Override
+    public IntStream boundaries() {
+      return SegmentsImplUtils.boundaries(this.breakIter, this.source);
+    }
+
+    @Override
+    public IntStream boundariesAfter(int i) {
+      return SegmentsImplUtils.boundariesAfter(this.breakIter, this.source, i);
+    }
+
+    @Override
+    public IntStream boundariesBackFrom(int i) {
+      return SegmentsImplUtils.boundariesBackFrom(this.breakIter, this.source, i);
+    }
+  }
+}
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java
new file mode 100644
index 000000000000..f761e08d8dab
--- /dev/null
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java
@@ -0,0 +1,17 @@
+package com.ibm.icu.text.segmenter;
+
+import com.ibm.icu.text.BreakIterator;
+
+/** Factory for {@link Segments}, each of which exposes the segmentation of one source sequence. */
+public interface Segmenter {
+  /** Returns the {@link Segments} describing how {@code s} is segmented by this segmenter. */
+  Segments segment(CharSequence s);
+
+  /**
+   * @internal
+   * @deprecated This API is ICU internal only.
+   */
+  @Deprecated
+  BreakIterator getNewBreakIterator();
+
+}
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java
new file mode 100644
index 000000000000..8840b1c96fce
--- /dev/null
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java
@@ -0,0 +1,243 @@
+package com.ibm.icu.text.segmenter;
+
+import com.ibm.icu.text.BreakIterator;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+/**
+ * The segmentation of a single source sequence, exposed as lazily evaluated streams of segments,
+ * subsequences, and boundary offsets.
+ */
+public interface Segments {
+
+  Stream<CharSequence> subSequences();
+
+  Segment segmentAt(int i);
+
+  Stream<Segment> segments();
+
+  Stream<Segment> segmentsFrom(int i);
+
+  Stream<Segment> segmentsBefore(int i);
+
+  Function<Segment, CharSequence> segmentToSequenceFn();
+
+  /**
+   * Returns whether offset {@code i} is a segmentation boundary. Throws an exception when
+   * {@code i} is not a valid index position for the source sequence.
+   *
+   * @param i the offset into the source sequence to test
+   * @return {@code true} if {@code i} is a segmentation boundary, {@code false} otherwise
+   */
+  boolean isBoundary(int i);
+
+  IntStream boundaries();
+
+  IntStream boundariesAfter(int i);
+
+  IntStream boundariesBackFrom(int i);
+
+  //
+  // Inner enums/classes in common for other inner classes
+  //
+
+  enum IterationDirection {
+    FORWARDS,
+    BACKWARDS,
+  }
+
+  //
+  // Inner classes for Segment, SegmentIterable, and SegmentIterator
+  //
+
+  // TODO: consider options in design for potential memory usage optimization:
+  // 1) keep simple class with public fields, but requires field per Segment to point to source
+  // 2) make Segment an interface (getSource, getStart, getLimit, getRuleStatus, newSegment), and
+  //    maybe an abstract class that implements the interface, maybe with a default method impl
+  //    for convenience for getting (allocating & returning) the subsequence
+  class Segment {
+    public final int start;
+    public final int limit;
+    public final int ruleStatus = 0;
+    public final CharSequence source;
+
+    public Segment(int start, int limit, CharSequence source) {
+      this.start = start;
+      this.limit = limit;
+      this.source = source;
+    }
+  }
+
+  /**
+   * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn
+   * enables the creation of a lazy {@code Stream}.
+   */
+  class SegmentIterable implements Iterable<Segment> {
+    BreakIterator breakIter;
+    final IterationDirection direction;
+    int startIdx;
+    final CharSequence source;
+
+    SegmentIterable(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) {
+      this.breakIter = breakIter;
+      this.direction = direction;
+      this.startIdx = startIdx;
+      this.source = source;
+    }
+
+    @Override
+    public Iterator<Segment> iterator() {
+      return new SegmentIterator(this.breakIter, this.direction, this.startIdx, this.source);
+    }
+  }
+
+  class SegmentIterator implements Iterator<Segment> {
+    BreakIterator breakIter;
+    final IterationDirection direction;
+    int start;
+    int limit;
+    final CharSequence source;
+
+    SegmentIterator(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) {
+      this.breakIter = breakIter;
+      this.direction = direction;
+      this.source = source;
+
+      if (direction == IterationDirection.FORWARDS) {
+        Segment segmentAtIdx = SegmentsImplUtils.segmentAt(breakIter, source, startIdx);
+        if (segmentAtIdx == null) {
+          // No segment begins at or contains startIdx (empty source, or startIdx at the end of
+          // the source). Mark the iterator as exhausted: leaving `limit` at its default of 0
+          // would make hasNext() return true and next() produce a bogus Segment(-1, 0).
+          this.start = BreakIterator.DONE;
+          this.limit = BreakIterator.DONE;
+        } else {
+          this.start = segmentAtIdx.start;
+          this.limit = breakIter.following(this.start);
+        }
+      } else {
+        assert direction == IterationDirection.BACKWARDS;
+        // Backwards iteration may legitimately start at the end of the source, where no segment
+        // begins, so it must not be gated on segmentAt() returning non-null.
+        breakIter.setText(source);
+        if (breakIter.isBoundary(startIdx)) {
+          // Note: breakIter::isBoundary is a stateful operation. It resets the position in the
+          // BreakIterator, which we want to ensure that the position is where we think it is.
+          this.start = startIdx;
+        } else {
+          // Since we already called BreakIterator.isBoundary() which mutates the BreakIterator
+          // position to increment forwards when the return value is false, we should call
+          // BreakIterator.previous() to update the iterator position while getting the start value
+          // of the segment at startIdx
+          this.start = breakIter.previous();
+        }
+        this.limit = getDirectionBasedNextIdx();
+      }
+    }
+
+    int getDirectionBasedNextIdx() {
+      if (direction == IterationDirection.FORWARDS) {
+        return breakIter.next();
+      } else {
+        assert direction == IterationDirection.BACKWARDS;
+        return breakIter.previous();
+      }
+    }
+
+    @Override
+    public boolean hasNext() {
+      return this.limit != BreakIterator.DONE;
+    }
+
+    @Override
+    public Segment next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException();
+      }
+
+      Segment result;
+      if (this.limit < this.start) {
+        result = new Segment(this.limit, this.start, this.source);
+      } else {
+        result = new Segment(this.start, this.limit, this.source);
+      }
+
+      this.start = this.limit;
+      this.limit = getDirectionBasedNextIdx();
+
+      return result;
+    }
+  }
+
+  //
+  // Inner classes for BoundaryIterable and BoundaryIterator
+  //
+
+  class BoundaryIterable implements Iterable<Integer> {
+    BreakIterator breakIter;
+    IterationDirection direction;
+    int startIdx;
+
+    BoundaryIterable(BreakIterator breakIter, IterationDirection direction, int startIdx) {
+      this.breakIter = breakIter;
+      this.direction = direction;
+      this.startIdx = startIdx;
+    }
+
+    @Override
+    public Iterator<Integer> iterator() {
+      return new BoundaryIterator(this.breakIter, this.direction, this.startIdx);
+    }
+  }
+
+  class BoundaryIterator implements Iterator<Integer> {
+    BreakIterator breakIter;
+    IterationDirection direction;
+    int currIdx;
+
+    BoundaryIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) {
+      this.breakIter = breakIter;
+      this.direction = direction;
+
+      // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs
+      if (startIdx < 0 && direction == IterationDirection.BACKWARDS) {
+        this.currIdx = BreakIterator.DONE;
+        return;
+      }
+
+      if (direction == IterationDirection.FORWARDS) {
+        this.currIdx = breakIter.following(startIdx);
+      } else {
+        assert direction == IterationDirection.BACKWARDS;
+        this.currIdx = breakIter.preceding(startIdx);
+      }
+    }
+
+    @Override
+    public boolean hasNext() {
+      return this.currIdx != BreakIterator.DONE;
+    }
+
+    @Override
+    public Integer next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException();
+      }
+
+      int result = this.currIdx;
+
+      if (direction == IterationDirection.FORWARDS) {
+        this.currIdx = breakIter.next();
+      } else {
+        assert direction == IterationDirection.BACKWARDS;
+        this.currIdx = breakIter.previous();
+      }
+
+      return result;
+    }
+  }
+
+}
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java
new file mode 100644
index 000000000000..09a521c2e6d5
--- /dev/null
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java
@@ -0,0 +1,156 @@
+package com.ibm.icu.text.segmenter;
+
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.segmenter.Segments.BoundaryIterable;
+import com.ibm.icu.text.segmenter.Segments.IterationDirection;
+import com.ibm.icu.text.segmenter.Segments.Segment;
+import com.ibm.icu.text.segmenter.Segments.SegmentIterable;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+/**
+ * Static helpers that implement the {@link Segments} operations on top of a {@link BreakIterator}.
+ * Helpers that use the iterator (re)set its text to the given source sequence before doing so.
+ */
+// Global TODO: make initialization of breakIterator a prerequisite
+public class SegmentsImplUtils {
+
+  public static boolean isBoundary(BreakIterator breakIter, CharSequence source, int i) {
+    breakIter.setText(source);
+
+    return breakIter.isBoundary(i);
+  }
+
+  public static Stream<CharSequence> subSequences(BreakIterator breakIter, CharSequence sourceSequence) {
+    return segments(breakIter, sourceSequence).map(segmentToSequenceFn(sourceSequence));
+  }
+
+  /**
+   * Returns the segment containing offset {@code i}, or {@code null} when no segment starts at
+   * or contains {@code i} (for example, when {@code i} is the end of the source).
+   */
+  public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    // TODO: make initialization of breakIterator a prerequisite
+    breakIter.setText(sourceSequence);
+
+    int start;
+    int limit;
+
+    boolean isBoundary = breakIter.isBoundary(i);
+
+    if (isBoundary) {
+      start = i;
+      limit = breakIter.next();
+    } else {
+      // BreakIterator::isBoundary(i) will advance forwards to the next boundary if the argument
+      // is not a boundary.
+      limit = breakIter.current();
+      start = breakIter.previous();
+    }
+
+    if (start != BreakIterator.DONE && limit != BreakIterator.DONE) {
+      return new Segment(start, limit, sourceSequence);
+    } else {
+      return null;
+    }
+  }
+
+  public static Stream<Segment> segments(BreakIterator breakIter, CharSequence sourceSequence) {
+    return segmentsFrom(breakIter, sourceSequence, 0);
+  }
+
+  public static Stream<Segment> segmentsFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    breakIter.setText(sourceSequence);
+
+    // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
+    SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i, sourceSequence);
+    return StreamSupport.stream(iterable.spliterator(), false);
+  }
+
+  public static Stream<Segment> segmentsBefore(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    breakIter.setText(sourceSequence);
+
+    // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
+    SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i, sourceSequence);
+    return StreamSupport.stream(iterable.spliterator(), false);
+  }
+
+  public static Segment segmentAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    breakIter.setText(sourceSequence);
+
+    int start = breakIter.following(i);
+    if (start == BreakIterator.DONE) {
+      return null;
+    }
+
+    int limit = breakIter.next();
+    if (limit == BreakIterator.DONE) {
+      return null;
+    }
+
+    return new Segment(start, limit, sourceSequence);
+  }
+
+  public static Segment segmentBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    breakIter.setText(sourceSequence);
+
+
+    // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs
+    if (i < 0) {
+      // return the same thing as we would if preceding() returned DONE
+      return null;
+    }
+
+    int start = breakIter.preceding(i);
+    int limit = breakIter.previous();
+
+    if (start == BreakIterator.DONE || limit == BreakIterator.DONE) {
+      return null;
+    }
+
+    assert limit <= start;
+
+    return new Segment(limit, start, sourceSequence);
+  }
+
+  public static Function<Segment, CharSequence> segmentToSequenceFn(CharSequence sourceSequence) {
+    return segment -> sourceSequence.subSequence(segment.start, segment.limit);
+  }
+
+  public static IntStream boundaries(BreakIterator breakIter, CharSequence sourceSequence) {
+    return boundariesAfter(breakIter, sourceSequence, -1);
+  }
+
+  public static IntStream boundariesAfter(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    breakIter.setText(sourceSequence);
+
+    // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
+    // TODO: optimize IntStream creation to avoid autoboxing
+    BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.FORWARDS, i);
+    Stream<Integer> boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false);
+    return boundariesAsIntegers.mapToInt(Integer::intValue);
+  }
+
+  public static IntStream boundariesBackFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    if (i < 0) {
+      return IntStream.empty();
+    }
+
+    // TODO: make initialization of breakIterator a prerequisite
+    breakIter.setText(sourceSequence);
+
+    // The backwards search starts strictly before its start index, so when `i` is itself a
+    // boundary, search back from `i + 1` in order to include `i` in the result.
+    boolean isOnBoundary = i <= sourceSequence.length() && breakIter.isBoundary(i);
+    int backFromIdx = isOnBoundary ? i + 1 : i;
+
+    // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
+    // TODO: optimize IntStream creation to avoid autoboxing
+    BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, backFromIdx);
+    Stream<Integer> boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false);
+    return boundariesAsIntegers.mapToInt(Integer::intValue);
+  }
+
+}
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java
index dc7b59d873ef..ab9de9cda46d 100644
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java
@@ -46,7 +46,7 @@ public Object[][] getContents() {
             // all of which should not influence the algorithm
             "$_ignore_=[[:Mn:][:Me:][:Cf:]];"
 
-            // lower and upper case Roman letters, apostrophy and dash are
+            // lower and upper case Roman letters, apostrophe and dash are
             // in the English dictionary
             +"$_dictionary_=[a-zA-Z\\'\\-];"
 
@@ -64,7 +64,7 @@ public Object[][] getContents() {
             +"$mid_word=[[:Pd:]\u00ad\u2027\\\"\\\'];"
 
             // punctuation that can occur in the middle of a number: currently
-            // apostrophes, qoutation marks, periods, commas, and the Arabic
+            // apostrophes, quotation marks, periods, commas, and the Arabic
             // decimal point
             +"$mid_num=[\\\"\\\'\\,\u066b\\.];"
 
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java
new file mode 100644
index 000000000000..810b388d3482
--- /dev/null
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java
@@ -0,0 +1,47 @@
+package com.ibm.icu.dev.test.text.segmenter;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+
+import com.ibm.icu.dev.test.CoreTestFmwk;
+import com.ibm.icu.text.segmenter.LocalizedSegmenter;
+import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType;
+import com.ibm.icu.text.segmenter.Segmenter;
+import com.ibm.icu.text.segmenter.Segments;
+import com.ibm.icu.util.ULocale;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class LocalizedSegmenterTest extends CoreTestFmwk {
+
+  @Test
+  public void testLocaleInLocalizedSegmenter() {
+    String source = "Die 21en Jahrh. ist die Beste.";
+
+    Object[][] casesData = {
+        {"de", Arrays.asList("Die 21en Jahrh. ist die Beste.")},
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String localeTag = (String) caseDatum[0];
+      ULocale locale = ULocale.forLanguageTag(localeTag);
+      List<CharSequence> expWords = (List<CharSequence>) caseDatum[1];
+
+      Segmenter wordSeg =
+          LocalizedSegmenter.builder()
+              .setLocale(locale)
+              .setSegmentationType(SegmentationType.SENTENCE)
+              .build();
+      Segments segments = wordSeg.segment(source);
+
+      List<CharSequence> actWords = segments.subSequences().collect(Collectors.toList());
+
+      assertThat(actWords, is(expWords));
+    }
+  }
+}
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java
new file mode 100644
index 000000000000..5e46fe608038
--- /dev/null
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java
@@ -0,0 +1,49 @@
+package com.ibm.icu.dev.test.text.segmenter;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+
+import com.ibm.icu.dev.test.CoreTestFmwk;
+import com.ibm.icu.text.segmenter.RuleBasedSegmenter;
+import com.ibm.icu.text.segmenter.Segmenter;
+import com.ibm.icu.text.segmenter.Segments;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class RuleBasedSegmenterTest extends CoreTestFmwk {
+
+  @Test
+  public void testRules() {
+    String source = "hejsan k:a tack";
+
+    Object[][] casesData = {
+        {"default", ".*;", Arrays.asList("hejsan k:a tack")},
+        // TODO: add more cases once RBBI rule syntax is understood
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String desc = (String) caseDatum[0];
+      String subrule = (String) caseDatum[1];
+      List<CharSequence> expWords = (List<CharSequence>) caseDatum[2];
+
+      // the following rule substring was taken as a subset from BreakIteratorRules_en_US_TEST.java:
+      String rules = subrule;
+
+      Segmenter seg = RuleBasedSegmenter.builder()
+          .setRules(rules)
+          .build();
+      Segments segments = seg.segment(source);
+
+      List<CharSequence> actWords = segments.subSequences().collect(Collectors.toList());
+
+      assertThat(desc, actWords, is(expWords));
+    }
+
+  }
+
+}
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java
new file mode 100644
index 000000000000..f22be7231f8d
--- /dev/null
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java
@@ -0,0 +1,411 @@
+package com.ibm.icu.dev.test.text.segmenter;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.MatcherAssert.assertThat;
+
+import com.ibm.icu.dev.test.CoreTestFmwk;
+import com.ibm.icu.text.segmenter.LocalizedSegmenter;
+import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType;
+import com.ibm.icu.text.segmenter.Segmenter;
+import com.ibm.icu.text.segmenter.Segments;
+import com.ibm.icu.text.segmenter.Segments.Segment;
+import com.ibm.icu.util.ULocale;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class SegmentsTest extends CoreTestFmwk {
+
+  @Test
+  public void testSegments() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<Segment> segments = segments1.segments().collect(Collectors.toList());
+
+    assertEquals("first range start", 0, segments.get(0).start);
+    assertEquals("first range limit", 3, segments.get(0).limit);
+
+    assertEquals("second range start", 3, segments.get(1).start);
+    assertEquals("second range limit", 4, segments.get(1).limit);
+  }
+
+  @Test
+  public void testMultipleSegmentObjectsFromSegmenter() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+    String source2 = "Sphinx of black quartz, judge my vow.";
+    String source3 = "How vexingly quick daft zebras jump!";
+
+    List<CharSequence> exp1 = Arrays.asList("The", " ", "quick", " ", "brown", " ", "fox", " ",
+        "jumped", " ", "over", " ", "the", " ", "lazy", " ", "dog", ".");
+    List<CharSequence> exp2 = Arrays.asList("Sphinx", " ", "of", " ", "black", " ", "quartz", ",",
+        " ", "judge", " ", "my", " ", "vow", ".");
+    List<CharSequence> exp3 = Arrays.asList("How", " ", "vexingly", " ", "quick", " ", "daft", " ",
+        "zebras", " ", "jump", "!");
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+    List<CharSequence> act1 = segments1.subSequences().collect(Collectors.toList());
+    assertThat(act1, is(exp1));
+
+    // Create new Segments for source2
+    Segments segments2 = enWordSegmenter.segment(source2);
+    List<CharSequence> act2 = segments2.subSequences().collect(Collectors.toList());
+    assertThat(act2, is(exp2));
+
+    // Check that Segments for source1 is unaffected
+    act1 = segments1.subSequences().collect(Collectors.toList());
+    assertThat(act1, is(exp1));
+
+    // Create new Segments for source3
+    Segments segments3 = enWordSegmenter.segment(source3);
+    List<CharSequence> act3 = segments3.subSequences().collect(Collectors.toList());
+    assertThat(act3, is(exp3));
+
+    // Check that Segments for source1 is unaffected
+    act1 = segments1.subSequences().collect(Collectors.toList());
+    assertThat(act1, is(exp1));
+
+    // Check that Segments for source2 is unaffected
+    act2 = segments2.subSequences().collect(Collectors.toList());
+    assertThat(act2, is(exp2));
+  }
+
+  @Test
+  public void testIsBoundary() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    Object[][] casesData = {
+        {"start of segment", 4, true},
+        {"between start and limit of segment", 6, false},
+        {"limit of segment", 9, true},
+        {"beginning of string", 0, true},
+        {"end of string", source1.length(), true},
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String desc = (String) caseDatum[0];
+      int idx = (int) caseDatum[1];
+      boolean exp = (boolean) caseDatum[2];
+
+      assertThat(desc, segments1.isBoundary(idx) == exp);
+    }
+  }
+
+  @Test
+  public void testSegmentsFrom_middleOfSegment() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+    int startIdx = 1;
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<Segment> segments = segments1.segmentsFrom(startIdx).collect(Collectors.toList());
+
+    assertEquals("first range start", 0, segments.get(0).start);
+    assertEquals("first range limit", 3, segments.get(0).limit);
+
+    assertEquals("second range start", 3, segments.get(1).start);
+    assertEquals("second range limit", 4, segments.get(1).limit);
+  }
+
+  @Test
+  public void testSegmentsFrom_onBoundary() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+    int startIdx = 3;
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<Segment> segments = segments1.segmentsFrom(startIdx).collect(Collectors.toList());
+
+    assertEquals("first range start", 3, segments.get(0).start);
+    assertEquals("first range limit", 4, segments.get(0).limit);
+
+    assertEquals("second range start", 4, segments.get(1).start);
+    assertEquals("second range limit", 9, segments.get(1).limit);
+  }
+
+  @Test
+  public void testSegmentsFrom_atEnd() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    // No segment starts at the end of the source, so the stream must be empty
+    List<Segment> segments = segments1.segmentsFrom(source1.length()).collect(Collectors.toList());
+
+    assertEquals("number of segments from the end", 0, segments.size());
+  }
+
+  @Test
+  public void testSegmentsBefore_middleOfSegment() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+    int startIdx = 8;
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<Segment> segments = segments1.segmentsBefore(startIdx).collect(Collectors.toList());
+
+    assertEquals("first range start", 3, segments.get(0).start);
+    assertEquals("first range limit", 4, segments.get(0).limit);
+
+    assertEquals("second range start", 0, segments.get(1).start);
+    assertEquals("second range limit", 3, segments.get(1).limit);
+  }
+
+  @Test
+  public void testSegmentsBefore_onBoundary() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+    int startIdx = 9;
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<Segment> segments = segments1.segmentsBefore(startIdx).collect(Collectors.toList());
+
+    assertEquals("first range start", 4, segments.get(0).start);
+    assertEquals("first range limit", 9, segments.get(0).limit);
+
+    assertEquals("second range start", 3, segments.get(1).start);
+    assertEquals("second range limit", 4, segments.get(1).limit);
+  }
+
+  @Test
+  public void testSegmentsBefore_atEnd() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<Segment> segments = segments1.segmentsBefore(source1.length()).collect(Collectors.toList());
+
+    assertEquals("first range start", 44, segments.get(0).start);
+    assertEquals("first range limit", 45, segments.get(0).limit);
+  }
+
+  @Test
+  public void testSegmentToSequenceFn() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
+            .build();
+
+    String source1 = "The quick brown fox jumped over the lazy dog.";
+    int startIdx = 10;
+
+    // Create new Segments for source1
+    Segments segments1 = enWordSegmenter.segment(source1);
+
+    List<CharSequence> exp1 = Arrays.asList(" ", "quick", " ", "The");
+
+    List<CharSequence> act1 = segments1.segmentsBefore(startIdx)
+        .map(segments1.segmentToSequenceFn())
+        .collect(Collectors.toList());
+
+    assertThat(act1, is(exp1));
+  }
+
+  @Test
+  public void testBoundaries() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source
+    Segments segments = enWordSegmenter.segment(source);
+
+    int[] exp = {0, 3, 4, 9, 10, 15, 16, 19, 20, 26, 27, 31, 32, 35, 36, 40, 41, 44, 45};
+
+    int[] act = segments.boundaries().toArray();
+
+    assertThat(act, is(exp));
+  }
+
+  @Test
+  public void testBoundariesAfter() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source = "The quick brown fox jumped over the lazy dog.";
+    int TAKE_LIMIT = 5;
+
+    // Create new Segments for source
+    Segments segments = enWordSegmenter.segment(source);
+
+    Object[][] casesData = {
+        {"first " + TAKE_LIMIT + " before beginning", -2, new int[]{0, 3, 4, 9, 10}},
+        {"first " + TAKE_LIMIT + " in the middle of the third segment", 5, new int[]{9, 10, 15, 16, 19}},
+        {"first " + TAKE_LIMIT + " on the limit of the third segment", 9, new int[]{10, 15, 16, 19, 20}},
+        {"first " + TAKE_LIMIT + " at the end", source.length(), new int[0]},
+        {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[0]},
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String desc = (String) caseDatum[0];
+      int startIdx = (int) caseDatum[1];
+      int[] exp = (int[]) caseDatum[2];
+
+      int[] act = segments.boundariesAfter(startIdx).limit(TAKE_LIMIT).toArray();
+
+      assertThat(desc, act, is(exp));
+    }
+  }
+
+  @Test
+  public void testBoundariesBackFrom() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source = "The quick brown fox jumped over the lazy dog.";
+    int TAKE_LIMIT = 5;
+
+    // Create new Segments for source
+    Segments segments = enWordSegmenter.segment(source);
+
+    Object[][] casesData = {
+        {"first " + TAKE_LIMIT + " before beginning", -2, new int[0]},
+        {"first " + TAKE_LIMIT + " at the beginning", 0, new int[]{0}},
+        {"first " + TAKE_LIMIT + " from the start of the 2nd to last segment", 41, new int[]{41, 40, 36, 35, 32}},
+        {"first " + TAKE_LIMIT + " in the middle of the 2nd to last segment", 42, new int[]{41, 40, 36, 35, 32}},
+        {"first " + TAKE_LIMIT + " at the end", source.length(), new int[]{45, 44, 41, 40, 36}},
+        {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[]{45, 44, 41, 40, 36}},
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String desc = (String) caseDatum[0];
+      int startIdx = (int) caseDatum[1];
+      int[] exp = (int[]) caseDatum[2];
+
+      int[] act = segments.boundariesBackFrom(startIdx).limit(TAKE_LIMIT).toArray();
+
+      assertThat(desc, act, is(exp));
+
+      if (startIdx < 0) {
+        logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
+      }
+    }
+  }
+
+  @Test
+  public void testSegmentAt() {
+    Segmenter enWordSegmenter =
+        LocalizedSegmenter.builder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source
+    Segments segments1 = enWordSegmenter.segment(source);
+
+    Object[][] casesData = {
+        {"index before beginning", -2, null, null},
+        {"index at beginning", 0, 0, 3},
+        {"index in the middle of the first segment", 2, 0, 3},
+        {"index in the middle of the third segment", 5, 4, 9},
+        {"index at the end", source.length()-1, 44, 45},
+        {"index after the end", source.length()+1, null, null},
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String desc = (String) caseDatum[0];
+      int startIdx = (int) caseDatum[1];
+      Integer expStart = (Integer) caseDatum[2];
+      Integer expLimit = (Integer) caseDatum[3];
+
+      if (startIdx < 0 ) {
+        logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
+      }
+
+      if (expStart == null) {
+        assertThat("Out of bounds range should be null", expLimit == null);
+      } else {
+        Segment segment = segments1.segmentAt(startIdx);
+
+        assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start);
+        assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit);
+      }
+    }
+
+
+  }
+
+}