diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 9f3e44984eae..e43e70b36311 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -297,7 +297,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt index fc615f55db21..793163898e00 100644 --- a/icu4c/source/data/brkitr/rules/line_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_cj.txt @@ -298,7 +298,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 2bb9be5845f8..9ff4e17eb3a5 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -306,7 +306,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). 
# ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 15715a225123..428d225f16d9 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -318,7 +318,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt index 87ab33b48a1c..2edf4b3bc33a 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt @@ -331,7 +331,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index c41280c28d1c..bf6dee8c05cd 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -299,7 +299,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). 
# ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 31dd65854cb1..f596454621d0 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -304,7 +304,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt index 85d771fcdbf9..e0bbd00025f9 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt @@ -317,7 +317,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt index 41e05bf4963f..14b118789e7c 100644 --- a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt @@ -310,7 +310,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). 
# ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index c043a0a5d838..e94f76b2ce2e 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1494,104 +1494,6 @@ void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, #if !UCONFIG_NO_REGULAR_EXPRESSIONS -//--------------------------------------------------------------------------------------- -// -// class RBBIMonkeyKind -// -// Monkey Test for Break Iteration -// Abstract interface class. Concrete derived classes independently -// implement the break rules for different iterator types. -// -// The Monkey Test itself uses doesn't know which type of break iterator it is -// testing, but works purely in terms of the interface defined here. -// -//--------------------------------------------------------------------------------------- -class RBBIMonkeyKind { -public: - // Return a UVector of UnicodeSets, representing the character classes used - // for this type of iterator. - virtual const std::vector& charClasses() = 0; - - // Set the test text on which subsequent calls to next() will operate - virtual void setText(const UnicodeString &s) = 0; - - // Find the next break position, starting from the prev break position, or from zero. - // Return -1 after reaching end of string. - virtual int32_t next(int32_t i) = 0; - - // Name of each character class, parallel with charClasses. Used for debugging output - // of characters. 
- virtual std::vector& characterClassNames(); - - void setAppliedRule(int32_t position, const char* value); - - std::string getAppliedRule(int32_t position); - - virtual ~RBBIMonkeyKind(); - UErrorCode deferredStatus; - - std::string classNameFromCodepoint(const UChar32 c); - unsigned int maxClassNameSize(); - - protected: - RBBIMonkeyKind(); - std::vector classNames; - std::vector appliedRules; - - // Clear `appliedRules` and fill it with empty strings in the size of test text. - void prepareAppliedRules(int32_t size ); - - private: - -}; - -RBBIMonkeyKind::RBBIMonkeyKind() { - deferredStatus = U_ZERO_ERROR; -} - -RBBIMonkeyKind::~RBBIMonkeyKind() { -} - -std::vector& RBBIMonkeyKind::characterClassNames() { - return classNames; -} - -void RBBIMonkeyKind::prepareAppliedRules(int32_t size) { - // Remove all the information in the `appliedRules`. - appliedRules.clear(); - appliedRules.resize(size + 1); -} - -void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) { - appliedRules[position] = value; -} - -std::string RBBIMonkeyKind::getAppliedRule(int32_t position){ - return appliedRules[position]; -} - -std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) { - // Simply iterate through charClasses to find character's class - for (std::size_t aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) { - const UnicodeSet& classSet = charClasses()[aClassNum]; - if (classSet.contains(c)) { - return classNames[aClassNum]; - } - } - U_ASSERT(false); // This should not happen. 
- return "bad class name"; -} - -unsigned int RBBIMonkeyKind::maxClassNameSize() { - unsigned int maxSize = 0; - for (std::size_t aClassNum = 0; aClassNum < charClasses().size(); aClassNum++) { - auto aClassNumSize = static_cast(classNames[aClassNum].size()); - if (aClassNumSize > maxSize) { - maxSize = aClassNumSize; - } - } - return maxSize; -} namespace { @@ -1697,7 +1599,7 @@ class RemapRule : public SegmentationRule { break; } if (resolved[i].appliedRule != nullptr && - resolved[i].appliedRule->resolution() == BREAK) { + resolved[i].appliedRule->resolution() == BREAK) { printf("Replacement rule at remapped indices %d sqq. spans a break", matcher->start(status)); std::terminate(); @@ -1705,7 +1607,34 @@ class RemapRule : public SegmentationRule { resolved[i].appliedRule = this; resolved[i].indexInRemapped.reset(); } + // While replacing, we need to check that we are not creating + // surrogate pairs. Since appendReplacement performs two + // concatenations (the unreplaced segment and the replacement), we + // need to check in two places: whether the unreplaced segment + // starts with a trailing surrogate that ends up after a leading + // surrogate, and whether the replaced segment starts with a leading + // surrogate that ends up after a trailing surrogate. + // We break the pair by replacing one of the surrogates with U+FFFF, + // which has the same properties for all but line breaking, and the + // same behaviour in line breaking (lb=SG and lb=XX are both treated + // as lb=AL). 
+ std::optional trailingLead; + if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) { + trailingLead = result.length() - 1; + } + matcher->appendReplacement(result, replacement_, status); + + if (trailingLead && *trailingLead + 1 < result.length() && + U16_IS_TRAIL(result[*trailingLead + 1])) { + result.setCharAt(*trailingLead, u'\uFFFF'); + } + + if (matcher->start(status) + offset > 0 && + U16_IS_LEAD(result[matcher->start(status) + offset - 1]) && + U16_IS_TRAIL(result[matcher->start(status) + offset])) { + result.setCharAt(matcher->start(status) + offset, u'\uFFFF'); + } offset = result.length() - *resolved[i].indexInRemapped; } for (; i < static_cast(resolved.size()); ++i) { @@ -1714,7 +1643,17 @@ class RemapRule : public SegmentationRule { } *resolved[i].indexInRemapped += offset; } + + std::optional trailingLead; + if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) { + trailingLead = result.length() - 1; + } matcher->appendTail(result); + if (trailingLead && *trailingLead + 1 < result.length() && + U16_IS_TRAIL(result[*trailingLead + 1])) { + result.setCharAt(*trailingLead, u'\uFFFF'); + } + if (resolved.back().indexInRemapped != result.length()) { std::string indices; for (const auto r : resolved) { @@ -1850,1018 +1789,412 @@ class RegexRule : public SegmentationRule { const Resolution resolution_; }; -} // namespace +} // namespace -//---------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------- // -// Random Numbers. We need a long cycle length since we run overnight tests over -// millions of strings involving 1000 random generations per string -// (a 32-bit LCG will not do!), we want and a reasonably small state -// so that we can output it to reproduce failures. 
+// class RBBIMonkeyKind // -//--------------------------------------------------------------------------------------- -namespace { - -using RandomNumberGenerator = std::ranlux48; -constexpr RandomNumberGenerator::result_type defaultSeed = std::ranlux48_base::default_seed; -static RandomNumberGenerator randomNumberGenerator; - -RandomNumberGenerator deserialize(const std::string& state) { - RandomNumberGenerator result; - std::stringstream(state) >> result; - return result; -} - -std::string serialize(const RandomNumberGenerator& generator) { - std::stringstream result; - result << generator; - return result.str(); -} - -} // namespace - -//------------------------------------------------------------------------------------------ +// Monkey Test for Break Iteration +// Abstract interface class. Concrete derived classes independently +// implement the break rules for different iterator types. // -// class RBBICharMonkey Character (Grapheme Cluster) specific implementation -// of RBBIMonkeyKind. +// The Monkey Test itself doesn't know which type of break iterator it is +// testing, but works purely in terms of the interface defined here. 
// -//------------------------------------------------------------------------------------------ -class RBBICharMonkey: public RBBIMonkeyKind { -public: - RBBICharMonkey(); - virtual ~RBBICharMonkey(); - virtual const std::vector& charClasses() override; - virtual void setText(const UnicodeString &s) override; - virtual int32_t next(int32_t i) override; -private: - std::vector sets; - - UnicodeSet *fCRLFSet; - UnicodeSet *fControlSet; - UnicodeSet *fExtendSet; - UnicodeSet *fZWJSet; - UnicodeSet *fRegionalIndicatorSet; - UnicodeSet *fPrependSet; - UnicodeSet *fSpacingSet; - UnicodeSet *fLSet; - UnicodeSet *fVSet; - UnicodeSet *fTSet; - UnicodeSet *fLVSet; - UnicodeSet *fLVTSet; - UnicodeSet *fHangulSet; - UnicodeSet *fExtendedPictSet; - UnicodeSet *fInCBLinkerSet; - UnicodeSet *fInCBConsonantSet; - UnicodeSet *fInCBExtendSet; - UnicodeSet *fAnySet; - - const UnicodeString *fText; -}; - - -RBBICharMonkey::RBBICharMonkey() { - UErrorCode status = U_ZERO_ERROR; - - fText = nullptr; - - fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); - fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status); - fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status); - fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status); - fRegionalIndicatorSet = - new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); - fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); - fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); - fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); - fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); - fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), 
status); - fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); - fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); - fHangulSet = new UnicodeSet(); - fHangulSet->addAll(*fLSet); - fHangulSet->addAll(*fVSet); - fHangulSet->addAll(*fTSet); - fHangulSet->addAll(*fLVSet); - fHangulSet->addAll(*fLVTSet); - - fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); - fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status); - fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status); - fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status); - fAnySet = new UnicodeSet(0, 0x10ffff); - - // Create sets of characters, and add the names of the above character sets. - // In each new ICU release, add new names corresponding to the sets above. - - // Important: Keep class names the same as the class contents. - // TODO(egg): Use logic similar to line breaking. - sets.emplace_back(*fCRLFSet); classNames.emplace_back("CRLF"); - sets.emplace_back(*fControlSet); classNames.emplace_back("Control"); - sets.emplace_back(*fExtendSet); classNames.emplace_back("Extended"); - sets.emplace_back(*fRegionalIndicatorSet); classNames.emplace_back("RegionalIndicator"); - if (!fPrependSet->isEmpty()) { - sets.emplace_back(*fPrependSet); classNames.emplace_back("Prepend"); - } - sets.emplace_back(*fSpacingSet); classNames.emplace_back("Spacing"); - sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul"); - sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ"); - sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict"); - sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker"); - sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant"); - sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend"); - sets.emplace_back(*fAnySet); classNames.emplace_back("Any"); - - if 
(U_FAILURE(status)) { - deferredStatus = status; - } -} - - -void RBBICharMonkey::setText(const UnicodeString &s) { - fText = &s; - prepareAppliedRules(s.length()); -} - - - -int32_t RBBICharMonkey::next(int32_t prevPos) { - int p0, p1, p2, p3; // Indices of the significant code points around the - // break position being tested. The candidate break - // location is before p2. - - int breakPos = -1; - - UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. - UChar32 cBase; // for (X Extend*) patterns, the X character. - - if (U_FAILURE(deferredStatus)) { - return -1; - } - - // Previous break at end of string. return DONE. - if (prevPos >= fText->length()) { - return -1; - } - - p0 = p1 = p2 = p3 = prevPos; - c3 = fText->char32At(prevPos); - c0 = c1 = c2 = cBase = 0; - (void)p0; // suppress set but not used warning. - (void)c0; - - // Loop runs once per "significant" character position in the input text. - for (;;) { - // Move all of the positions forward in the input string. - p0 = p1; c0 = c1; - p1 = p2; c1 = c2; - p2 = p3; c2 = c3; - - // Advance p3 by one codepoint - p3 = fText->moveIndex32(p3, 1); - c3 = fText->char32At(p3); - - if (p1 == p2) { - // Still warming up the loop. (won't work with zero length strings, but we don't care) - continue; - } - - if (p2 == fText->length()) { - setAppliedRule(p2, "End of String"); - break; - } - - // No Extend or Format characters may appear between the CR and LF, - // which requires the additional check for p2 immediately following p1. 
- // - if (c1==0x0D && c2==0x0A && p1==(p2-1)) { - setAppliedRule(p2, "GB3 CR x LF"); - continue; - } - - if (fControlSet->contains(c1) || - c1 == 0x0D || - c1 == 0x0A) { - setAppliedRule(p2, "GB4 ( Control | CR | LF ) "); - break; - } - - if (fControlSet->contains(c2) || - c2 == 0x0D || - c2 == 0x0A) { - setAppliedRule(p2, "GB5 ( Control | CR | LF )"); - break; - } - - if (fLSet->contains(c1) && - (fLSet->contains(c2) || - fVSet->contains(c2) || - fLVSet->contains(c2) || - fLVTSet->contains(c2))) { - setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )"); - continue; - } - - if ((fLVSet->contains(c1) || fVSet->contains(c1)) && - (fVSet->contains(c2) || fTSet->contains(c2))) { - setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )"); - continue; - } - - if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && - fTSet->contains(c2)) { - setAppliedRule(p2, "GB8 ( LVT | T) x T"); - continue; - } - - if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) { - if (!fExtendSet->contains(c1)) { - cBase = c1; - } - setAppliedRule(p2, "GB9 x (Extend | ZWJ)"); - continue; - } - - if (fSpacingSet->contains(c2)) { - setAppliedRule(p2, "GB9a x SpacingMark"); - continue; - } - - if (fPrependSet->contains(c1)) { - setAppliedRule(p2, "GB9b Prepend x"); - continue; - } - - if (fInCBConsonantSet->contains(c2)) { - int pi = p1; - bool sawVirama = false; - while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) || - fInCBLinkerSet->contains(fText->char32At(pi)))) { - if (fInCBLinkerSet->contains(fText->char32At(pi))) { - sawVirama = true; - } - pi = fText->moveIndex32(pi, -1); - } - if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) { - setAppliedRule( - p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})"); - continue; - } - } +//--------------------------------------------------------------------------------------- +class RBBIMonkeyKind { + public: + // Return a vector of 
UnicodeSets, representing the character classes used + // for this type of iterator. + const std::vector &charClasses(); - if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) { - setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic"); - continue; - } + const UnicodeSet &dictionarySet() const; - // Note: The first if condition is a little tricky. We only need to force - // a break if there are three or more contiguous RIs. If there are - // only two, a break following will occur via other rules, and will include - // any trailing extend characters, which is needed behavior. - if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) - && fRegionalIndicatorSet->contains(c2)) { - setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator"); - break; - } - if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { - setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator"); - continue; - } + // Set the test text on which subsequent calls to next() will operate + void setText(const UnicodeString &s); - setAppliedRule(p2, "GB999 Any Any"); - break; - } + // Find the next break position, starting from the prev break position, or from zero. + // Return -1 after reaching end of string. + int32_t next(int32_t i); - breakPos = p2; - return breakPos; -} + // Name of each character class, parallel with charClasses. Used for debugging output + // of characters. 
+ std::vector &characterClassNames(); + void setAppliedRule(int32_t position, const char *value); + std::string getAppliedRule(int32_t position); -const std::vector& RBBICharMonkey::charClasses() { - return sets; -} + virtual ~RBBIMonkeyKind(); + UErrorCode deferredStatus; -RBBICharMonkey::~RBBICharMonkey() { - delete fCRLFSet; - delete fControlSet; - delete fExtendSet; - delete fRegionalIndicatorSet; - delete fPrependSet; - delete fSpacingSet; - delete fLSet; - delete fVSet; - delete fTSet; - delete fLVSet; - delete fLVTSet; - delete fHangulSet; - delete fAnySet; - delete fZWJSet; - delete fExtendedPictSet; - delete fInCBLinkerSet; - delete fInCBConsonantSet; - delete fInCBExtendSet; -} + std::string classNameFromCodepoint(const UChar32 c); + unsigned int maxClassNameSize(); -//------------------------------------------------------------------------------------------ -// -// class RBBIWordMonkey Word Break specific implementation -// of RBBIMonkeyKind. -// -//------------------------------------------------------------------------------------------ -class RBBIWordMonkey: public RBBIMonkeyKind { -public: - RBBIWordMonkey(); - virtual ~RBBIWordMonkey(); - virtual const std::vector& charClasses() override; - virtual void setText(const UnicodeString &s) override; - virtual int32_t next(int32_t i) override; -private: + protected: + RBBIMonkeyKind(); + std::vector classNames; std::vector sets; + std::vector> rules; + UnicodeSet dictionarySet_; - UnicodeSet *fCRSet; - UnicodeSet *fLFSet; - UnicodeSet *fNewlineSet; - UnicodeSet *fRegionalIndicatorSet; - UnicodeSet *fKatakanaSet; - UnicodeSet *fHebrew_LetterSet; - UnicodeSet *fALetterSet; - UnicodeSet *fSingle_QuoteSet; - UnicodeSet *fDouble_QuoteSet; - UnicodeSet *fMidNumLetSet; - UnicodeSet *fMidLetterSet; - UnicodeSet *fMidNumSet; - UnicodeSet *fNumericSet; - UnicodeSet *fFormatSet; - UnicodeSet *fOtherSet = nullptr; - UnicodeSet *fExtendSet; - UnicodeSet *fExtendNumLetSet; - UnicodeSet *fWSegSpaceSet; - UnicodeSet 
*fDictionarySet = nullptr; - UnicodeSet *fZWJSet; - UnicodeSet *fExtendedPictSet; - - const UnicodeString *fText; -}; - - -RBBIWordMonkey::RBBIWordMonkey() -{ - UErrorCode status = U_ZERO_ERROR; - - fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status); - fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status); - fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status); - fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status); - fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status); - fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status); - fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status); - fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status); - fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status); - fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status); - fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status); - fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status); - fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status); - fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status); - fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status); - // There are some sc=Hani characters with WB=Extend. - // The break rules need to pick one or the other because - // Extend overlapping with something else is messy. - // For Unicode 13, we chose to keep U+16FF0 & U+16FF1 - // in $Han (for $dictionary) and out of $Extend. 
- fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status); - fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status); - - fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status); - fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status); - if(U_FAILURE(status)) { - IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status)); - deferredStatus = status; - return; - } - - fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); - fDictionarySet->addAll(*fKatakanaSet); - fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status)); - - fALetterSet->removeAll(*fDictionarySet); - - fOtherSet = new UnicodeSet(); - if(U_FAILURE(status)) { - IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status)); - deferredStatus = status; - return; - } + // Clear `appliedRules` and fill it with empty strings in the size of test text. + void prepareAppliedRules(int32_t size); - fOtherSet->complement(); - fOtherSet->removeAll(*fCRSet); - fOtherSet->removeAll(*fLFSet); - fOtherSet->removeAll(*fNewlineSet); - fOtherSet->removeAll(*fKatakanaSet); - fOtherSet->removeAll(*fHebrew_LetterSet); - fOtherSet->removeAll(*fALetterSet); - fOtherSet->removeAll(*fSingle_QuoteSet); - fOtherSet->removeAll(*fDouble_QuoteSet); - fOtherSet->removeAll(*fMidLetterSet); - fOtherSet->removeAll(*fMidNumSet); - fOtherSet->removeAll(*fNumericSet); - fOtherSet->removeAll(*fExtendNumLetSet); - fOtherSet->removeAll(*fWSegSpaceSet); - fOtherSet->removeAll(*fFormatSet); - fOtherSet->removeAll(*fExtendSet); - fOtherSet->removeAll(*fRegionalIndicatorSet); - fOtherSet->removeAll(*fZWJSet); - fOtherSet->removeAll(*fExtendedPictSet); - - // Inhibit dictionary characters from being tested at all. 
- fOtherSet->removeAll(*fDictionarySet); - - // Add classes and their names - sets.emplace_back(*fCRSet); classNames.emplace_back("CR"); - sets.emplace_back(*fLFSet); classNames.emplace_back("LF"); - sets.emplace_back(*fNewlineSet); classNames.emplace_back("Newline"); - sets.emplace_back(*fRegionalIndicatorSet); classNames.emplace_back("RegionalIndicator"); - sets.emplace_back(*fHebrew_LetterSet); classNames.emplace_back("Hebrew"); - sets.emplace_back(*fALetterSet); classNames.emplace_back("ALetter"); - sets.emplace_back(*fSingle_QuoteSet); classNames.emplace_back("Single Quote"); - sets.emplace_back(*fDouble_QuoteSet); classNames.emplace_back("Double Quote"); - // Omit Katakana from fSets, which omits Katakana characters - // from the test data. They are all in the dictionary set, - // which this (old, to be retired) monkey test cannot handle. - //sets.emplace_back(*fKatakanaSet); - - sets.emplace_back(*fMidLetterSet); classNames.emplace_back("MidLetter"); - sets.emplace_back(*fMidNumLetSet); classNames.emplace_back("MidNumLet"); - sets.emplace_back(*fMidNumSet); classNames.emplace_back("MidNum"); - sets.emplace_back(*fNumericSet); classNames.emplace_back("Numeric"); - sets.emplace_back(*fFormatSet); classNames.emplace_back("Format"); - sets.emplace_back(*fExtendSet); classNames.emplace_back("Extend"); - sets.emplace_back(*fOtherSet); classNames.emplace_back("Other"); - sets.emplace_back(*fExtendNumLetSet); classNames.emplace_back("ExtendNumLet"); - sets.emplace_back(*fWSegSpaceSet); classNames.emplace_back("WSegSpace"); - - sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ"); - sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict"); + private: + std::vector appliedRules; + UnicodeString text; + std::vector resolved; +}; - if (U_FAILURE(status)) { - deferredStatus = status; - } +RBBIMonkeyKind::RBBIMonkeyKind() { + deferredStatus = U_ZERO_ERROR; } -void RBBIWordMonkey::setText(const UnicodeString &s) { - fText = &s; - 
prepareAppliedRules(s.length()); +RBBIMonkeyKind::~RBBIMonkeyKind() { } +const std::vector &RBBIMonkeyKind::charClasses() { + return sets; } -int32_t RBBIWordMonkey::next(int32_t prevPos) { - int p0, p1, p2, p3; // Indices of the significant code points around the - // break position being tested. The candidate break - // location is before p2. - - int breakPos = -1; - - UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. - - if (U_FAILURE(deferredStatus)) { - return -1; - } - - // Prev break at end of string. return DONE. - if (prevPos >= fText->length()) { - return -1; - } - p0 = p1 = p2 = p3 = prevPos; - c3 = fText->char32At(prevPos); - c0 = c1 = c2 = 0; - (void)p0; // Suppress set but not used warning. - - // Loop runs once per "significant" character position in the input text. - for (;;) { - // Move all of the positions forward in the input string. - p0 = p1; c0 = c1; - p1 = p2; c1 = c2; - p2 = p3; c2 = c3; +const UnicodeSet &RBBIMonkeyKind::dictionarySet() const { + return dictionarySet_; +} - // Advance p3 by X(Extend | Format)* Rule 4 - // But do not advance over Extend & Format following a new line. 
(Unicode 5.1 change) - do { - p3 = fText->moveIndex32(p3, 1); - c3 = fText->char32At(p3); - if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { - break; +void RBBIMonkeyKind::setText(const UnicodeString &s) { + text = s; + prepareAppliedRules(s.length()); + UnicodeString remapped = s; + resolved.clear(); + resolved.reserve(s.length() + 1); + for (int i = 0; i < s.length() + 1; ++i) { + resolved.emplace_back(i); + } + for (const auto &rule : rules) { + rule->apply(remapped, resolved); + } + for (std::size_t i = 0; i < resolved.size(); ++i) { + if (resolved[i].appliedRule == nullptr) { + if (i > 0 && U16_IS_LEAD(s[i-1]) && U16_IS_TRAIL(s[i])) { + continue; } + printf("Failed to resolve at %zu between U+%04X and U+%04X ", i, s.char32At(i - 1), + s.char32At(i)); + if (resolved[i].indexInRemapped.has_value()) { + printf("which is remapped %zu between U+%04X and U+%04X", *resolved[i].indexInRemapped, + remapped.char32At(*resolved[i].indexInRemapped - 1), + remapped.char32At(*resolved[i].indexInRemapped)); + } + std::terminate(); + } else { + setAppliedRule(i, resolved[i].appliedRule->name().c_str()); } - while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3)); - - - if (p1 == p2) { - // Still warming up the loop. (won't work with zero length strings, but we don't care) - continue; - } - - if (p2 == fText->length()) { - // Reached end of string. Always a break position. - break; - } - - // No Extend or Format characters may appear between the CR and LF, - // which requires the additional check for p2 immediately following p1. 
- // - if (c1==0x0D && c2==0x0A) { - setAppliedRule(p2, "WB3 CR x LF"); - continue; - } - - if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { - setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)"); - break; - } - if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { - setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)"); - break; - } - - // Not ignoring extend chars, so peek into input text to - // get the potential ZWJ, the character immediately preceding c2. - // Sloppy UChar32 indexing: p2-1 may reference trail half - // but char32At will get the full code point. - if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){ - setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic"); - continue; - } - - if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) { - setAppliedRule(p2, "WB3d Keep horizontal whitespace together."); - continue; - } - - if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && - (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { - setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)"); - continue; - } - - if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && - (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && - (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { - setAppliedRule(p2, - "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)"); - continue; - } - - if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && - (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && - (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { - setAppliedRule(p2, - "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)"); 
- continue; - } - - if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { - setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote"); - continue; - } - - if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { - setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter"); - continue; - } - - if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { - setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter"); - continue; - } - - if (fNumericSet->contains(c1) && - fNumericSet->contains(c2)) { - setAppliedRule(p2, "WB8 Numeric x Numeric"); - continue; - } - - if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && - fNumericSet->contains(c2)) { - setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric"); - continue; - } - - if (fNumericSet->contains(c1) && - (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { - setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)"); - continue; - } - - if (fNumericSet->contains(c0) && - (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && - fNumericSet->contains(c2)) { - setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric"); - continue; - } + } +} - if (fNumericSet->contains(c1) && - (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && - fNumericSet->contains(c3)) { - setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric"); - continue; +int32_t RBBIMonkeyKind::next(int32_t startPos) { + for (std::size_t i = startPos + 1; i < resolved.size(); ++i) { + if (resolved[i].appliedRule != nullptr && + resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) { + return i; } + } + return -1; +} - // Note: matches UAX 29 rules, but doesn't come into play for ICU because - // all Katakana are handled by the dictionary breaker. 
- if (fKatakanaSet->contains(c1) && - fKatakanaSet->contains(c2)) { - setAppliedRule(p2, "WB13 Katakana x Katakana"); - continue; - } +std::vector &RBBIMonkeyKind::characterClassNames() { + return classNames; +} - if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || - fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && - fExtendNumLetSet->contains(c2)) { - setAppliedRule(p2, - "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet"); - continue; - } +void RBBIMonkeyKind::prepareAppliedRules(int32_t size) { + // Remove all the information in the `appliedRules`. + appliedRules.clear(); + appliedRules.resize(size + 1); +} - if (fExtendNumLetSet->contains(c1) && - (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || - fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { - setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)"); - continue; - } +void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) { + appliedRules[position] = value; +} - if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { - setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators."); - break; - } - if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { - setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators."); - continue; - } +std::string RBBIMonkeyKind::getAppliedRule(int32_t position){ + return appliedRules[position]; +} - setAppliedRule(p2, "WB999"); - break; +std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) { + // Simply iterate through charClasses to find character's class + for (std::size_t aClassNum = 0; aClassNum < sets.size(); aClassNum++) { + const UnicodeSet &classSet = sets[aClassNum]; + if (classSet.contains(c)) { + return classNames[aClassNum]; + } } + U_ASSERT(false); // This should not happen. 
+ return "bad class name"; +} - breakPos = p2; - return breakPos; +unsigned int RBBIMonkeyKind::maxClassNameSize() { + unsigned int maxSize = 0; + for (std::size_t aClassNum = 0; aClassNum < classNames.size(); aClassNum++) { + auto aClassNumSize = static_cast(classNames[aClassNum].size()); + if (aClassNumSize > maxSize) { + maxSize = aClassNumSize; + } + } + return maxSize; } +//---------------------------------------------------------------------------------------- +// +// Random Numbers. We need a long cycle length since we run overnight tests over +// millions of strings involving 1000 random generations per string +// (a 32-bit LCG will not do!), and we want a reasonably small state +// so that we can output it to reproduce failures. +// +//--------------------------------------------------------------------------------------- +namespace { -const std::vector& RBBIWordMonkey::charClasses() { - return sets; -} +using RandomNumberGenerator = std::ranlux48; +constexpr RandomNumberGenerator::result_type defaultSeed = std::ranlux48_base::default_seed; +static RandomNumberGenerator randomNumberGenerator; -RBBIWordMonkey::~RBBIWordMonkey() { - delete fCRSet; - delete fLFSet; - delete fNewlineSet; - delete fKatakanaSet; - delete fHebrew_LetterSet; - delete fALetterSet; - delete fSingle_QuoteSet; - delete fDouble_QuoteSet; - delete fMidNumLetSet; - delete fMidLetterSet; - delete fMidNumSet; - delete fNumericSet; - delete fFormatSet; - delete fExtendSet; - delete fExtendNumLetSet; - delete fWSegSpaceSet; - delete fRegionalIndicatorSet; - delete fDictionarySet; - delete fOtherSet; - delete fZWJSet; - delete fExtendedPictSet; +RandomNumberGenerator deserialize(const std::string& state) { + RandomNumberGenerator result; + std::stringstream(state) >> result; + return result; } +std::string serialize(const RandomNumberGenerator& generator) { + std::stringstream result; + result << generator; + return result.str(); +} - +} // namespace 
//------------------------------------------------------------------------------------------ // -// class RBBISentMonkey Sentence Break specific implementation +// class RBBICharMonkey Character (Grapheme Cluster) specific implementation // of RBBIMonkeyKind. // //------------------------------------------------------------------------------------------ -class RBBISentMonkey: public RBBIMonkeyKind { +class RBBICharMonkey: public RBBIMonkeyKind { public: - RBBISentMonkey(); - virtual ~RBBISentMonkey(); - virtual const std::vector& charClasses() override; - virtual void setText(const UnicodeString &s) override; - virtual int32_t next(int32_t i) override; -private: - int moveBack(int posFrom); - int moveForward(int posFrom); - UChar32 cAt(int pos); - - std::vector sets; - - UnicodeSet *fSepSet; - UnicodeSet *fFormatSet; - UnicodeSet *fSpSet; - UnicodeSet *fLowerSet; - UnicodeSet *fUpperSet; - UnicodeSet *fOLetterSet; - UnicodeSet *fNumericSet; - UnicodeSet *fATermSet; - UnicodeSet *fSContinueSet; - UnicodeSet *fSTermSet; - UnicodeSet *fCloseSet; - UnicodeSet *fOtherSet; - UnicodeSet *fExtendSet; - - const UnicodeString *fText; + RBBICharMonkey(); }; -RBBISentMonkey::RBBISentMonkey() -{ - UErrorCode status = U_ZERO_ERROR; - - // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator - // set and made into character classes of their own. For the monkey impl, - // they remain in SEP, since Sep always appears with CR and LF in the rules. 
- fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); - fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); - fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); - fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); - fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); - fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); - fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); - fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); - fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); - fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); - fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); - fOtherSet = new UnicodeSet(); - - if(U_FAILURE(status)) { - deferredStatus = status; - return; - } - - fOtherSet->complement(); - fOtherSet->removeAll(*fSepSet); - fOtherSet->removeAll(*fFormatSet); - fOtherSet->removeAll(*fSpSet); - fOtherSet->removeAll(*fLowerSet); - fOtherSet->removeAll(*fUpperSet); - fOtherSet->removeAll(*fOLetterSet); - fOtherSet->removeAll(*fNumericSet); - fOtherSet->removeAll(*fATermSet); - fOtherSet->removeAll(*fSContinueSet); - fOtherSet->removeAll(*fSTermSet); - fOtherSet->removeAll(*fCloseSet); - fOtherSet->removeAll(*fExtendSet); - - sets.emplace_back(*fSepSet); classNames.emplace_back("Sep"); - sets.emplace_back(*fFormatSet); classNames.emplace_back("Format"); - sets.emplace_back(*fSpSet); classNames.emplace_back("Sp"); - sets.emplace_back(*fLowerSet); classNames.emplace_back("Lower"); - 
sets.emplace_back(*fUpperSet); classNames.emplace_back("Upper"); - sets.emplace_back(*fOLetterSet); classNames.emplace_back("OLetter"); - sets.emplace_back(*fNumericSet); classNames.emplace_back("Numeric"); - sets.emplace_back(*fATermSet); classNames.emplace_back("ATerm"); - sets.emplace_back(*fSContinueSet); classNames.emplace_back("SContinue"); - sets.emplace_back(*fSTermSet); classNames.emplace_back("STerm"); - sets.emplace_back(*fCloseSet); classNames.emplace_back("Close"); - sets.emplace_back(*fOtherSet); classNames.emplace_back("Other"); - sets.emplace_back(*fExtendSet); classNames.emplace_back("Extend"); - - if (U_FAILURE(status)) { - deferredStatus = status; - } -} - - -void RBBISentMonkey::setText(const UnicodeString &s) { - fText = &s; - prepareAppliedRules(s.length()); -} +RBBICharMonkey::RBBICharMonkey() { + UErrorCode status = U_ZERO_ERROR; -const std::vector& RBBISentMonkey::charClasses() { - return sets; -} + std::list> partition; -// moveBack() Find the "significant" code point preceding the index i. -// Skips over ($Extend | $Format)* . -// -int RBBISentMonkey::moveBack(int i) { - if (i <= 0) { - return -1; - } - UChar32 c; - int32_t j = i; - do { - j = fText->moveIndex32(j, -1); - c = fText->char32At(j); - } - while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); - return j; + // These two could be part of the rules. + rules.push_back(std::make_unique(uR"(GB1 sot ÷ Any)", uR"(^)", u'÷', uR"()")); + // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead. + // The generated rules use the same (?!.). + rules.push_back(std::make_unique(uR"(GB2 Any ÷ eot)", uR"()", u'÷', uR"((?!.))")); - } + // --- NOLI ME TANGERE --- + // Generated by GenerateBreakTest.java in the Unicode tools. 
+ partition.emplace_back("CR", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=CR}])", status)); + partition.emplace_back("LF", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=LF}])", status)); + partition.emplace_back("Control", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Control}])", status)); + partition.emplace_back("Extend_ConjunctLinker", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Extend}&\p{Indic_Conjunct_Break=Linker}])", status)); + partition.emplace_back("Extend_ConjunctExtendermConjunctLinker", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Extend}&[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]-\p{Indic_Conjunct_Break=Linker}])", status)); + partition.emplace_back("ExtendmConjunctLinkermConjunctExtender", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Extend}-\p{Indic_Conjunct_Break=Linker}-[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]])", status)); + partition.emplace_back("ZWJ", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=ZWJ}])", status)); + partition.emplace_back("RI", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Regional_Indicator}])", status)); + partition.emplace_back("Prepend", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Prepend}])", status)); + partition.emplace_back("SpacingMark", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=SpacingMark}])", status)); + partition.emplace_back("L", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=L}])", status)); + partition.emplace_back("V", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=V}])", status)); + partition.emplace_back("T", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=T}])", status)); + partition.emplace_back("LV", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=LV}])", status)); + partition.emplace_back("LVT", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=LVT}])", status)); + partition.emplace_back("LinkingConsonant", UnicodeSet(uR"([\p{Indic_Conjunct_Break=Consonant}])", status)); + partition.emplace_back("ExtPict", UnicodeSet(uR"([\p{Extended_Pictographic}])", status)); + 
partition.emplace_back("XXmLinkingConsonantmExtPict", UnicodeSet(uR"([\p{Grapheme_Cluster_Break=Other}-\p{Indic_Conjunct_Break=Consonant}-\p{Extended_Pictographic}])", status)); + + rules.push_back(std::make_unique(uR"($CR × $LF)", uR"(\p{Grapheme_Cluster_Break=CR})", u'×', uR"(\p{Grapheme_Cluster_Break=LF})")); + rules.push_back(std::make_unique(uR"(( $Control | $CR | $LF ) ÷)", uR"(( \p{Grapheme_Cluster_Break=Control} | \p{Grapheme_Cluster_Break=CR} | \p{Grapheme_Cluster_Break=LF} ))", u'÷', uR"()")); + rules.push_back(std::make_unique(uR"(÷ ( $Control | $CR | $LF ))", uR"()", u'÷', uR"(( \p{Grapheme_Cluster_Break=Control} | \p{Grapheme_Cluster_Break=CR} | \p{Grapheme_Cluster_Break=LF} ))")); + rules.push_back(std::make_unique(uR"($L × ( $L | $V | $LV | $LVT ))", uR"(\p{Grapheme_Cluster_Break=L})", u'×', uR"(( \p{Grapheme_Cluster_Break=L} | \p{Grapheme_Cluster_Break=V} | \p{Grapheme_Cluster_Break=LV} | \p{Grapheme_Cluster_Break=LVT} ))")); + rules.push_back(std::make_unique(uR"(( $LV | $V ) × ( $V | $T ))", uR"(( \p{Grapheme_Cluster_Break=LV} | \p{Grapheme_Cluster_Break=V} ))", u'×', uR"(( \p{Grapheme_Cluster_Break=V} | \p{Grapheme_Cluster_Break=T} ))")); + rules.push_back(std::make_unique(uR"(( $LVT | $T) × $T)", uR"(( \p{Grapheme_Cluster_Break=LVT} | \p{Grapheme_Cluster_Break=T}))", u'×', uR"(\p{Grapheme_Cluster_Break=T})")); + rules.push_back(std::make_unique(uR"(× ($Extend | $ZWJ))", uR"()", u'×', uR"((\p{Grapheme_Cluster_Break=Extend} | \p{Grapheme_Cluster_Break=ZWJ}))")); + rules.push_back(std::make_unique(uR"(× $SpacingMark)", uR"()", u'×', uR"(\p{Grapheme_Cluster_Break=SpacingMark})")); + rules.push_back(std::make_unique(uR"($Prepend ×)", uR"(\p{Grapheme_Cluster_Break=Prepend})", u'×', uR"()")); + rules.push_back(std::make_unique(uR"($LinkingConsonant $ConjunctExtender* $ConjunctLinker $ConjunctExtender* × $LinkingConsonant)", uR"(\p{Indic_Conjunct_Break=Consonant} [\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]* 
\p{Indic_Conjunct_Break=Linker} [\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]*)", u'×', uR"(\p{Indic_Conjunct_Break=Consonant})")); + rules.push_back(std::make_unique(uR"($ExtPict $Extend* $ZWJ × $ExtPict)", uR"(\p{Extended_Pictographic} \p{Grapheme_Cluster_Break=Extend}* \p{Grapheme_Cluster_Break=ZWJ})", u'×', uR"(\p{Extended_Pictographic})")); + rules.push_back(std::make_unique(uR"(^ ($RI $RI)* $RI × $RI)", uR"(^ (\p{Grapheme_Cluster_Break=Regional_Indicator} \p{Grapheme_Cluster_Break=Regional_Indicator})* \p{Grapheme_Cluster_Break=Regional_Indicator})", u'×', uR"(\p{Grapheme_Cluster_Break=Regional_Indicator})")); + rules.push_back(std::make_unique(uR"([^$RI] ($RI $RI)* $RI × $RI)", uR"([^\p{Grapheme_Cluster_Break=Regional_Indicator}] (\p{Grapheme_Cluster_Break=Regional_Indicator} \p{Grapheme_Cluster_Break=Regional_Indicator})* \p{Grapheme_Cluster_Break=Regional_Indicator})", u'×', uR"(\p{Grapheme_Cluster_Break=Regional_Indicator})")); + // --- End of generated code. --- + // TODO(egg): This could just as well be part of the rules… + rules.push_back(std::make_unique(uR"(ALL ÷ / ÷ ALL)", uR"()", u'÷', uR"()")); -int RBBISentMonkey::moveForward(int i) { - if (i>=fText->length()) { - return fText->length(); - } - UChar32 c; - int32_t j = i; - do { - j = fText->moveIndex32(j, 1); - c = cAt(j); + for (const auto &[name, set] : partition) { + sets.push_back(set); + classNames.push_back(name); } - while (fFormatSet->contains(c) || fExtendSet->contains(c)); - return j; -} -UChar32 RBBISentMonkey::cAt(int pos) { - if (pos<0 || pos>=fText->length()) { - return -1; - } else { - return fText->char32At(pos); + if (U_FAILURE(status)) { + deferredStatus = status; } } -int32_t RBBISentMonkey::next(int32_t prevPos) { - int p0, p1, p2, p3; // Indices of the significant code points around the - // break position being tested. The candidate break - // location is before p2. - - int breakPos = -1; - - UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 
- UChar32 c; - - if (U_FAILURE(deferredStatus)) { - return -1; - } - - // Prev break at end of string. return DONE. - if (prevPos >= fText->length()) { - return -1; - } - p0 = p1 = p2 = p3 = prevPos; - c3 = fText->char32At(prevPos); - c0 = c1 = c2 = 0; - (void)p0; // Suppress set but not used warning. - - // Loop runs once per "significant" character position in the input text. - for (;;) { - // Move all of the positions forward in the input string. - p0 = p1; c0 = c1; - p1 = p2; c1 = c2; - p2 = p3; c2 = c3; - - // Advance p3 by X(Extend | Format)* Rule 4 - p3 = moveForward(p3); - c3 = cAt(p3); - - if (c1==0x0d && c2==0x0a && p2==(p1+1)) { - setAppliedRule(p2, "SB3 CR x LF"); - continue; - } +//------------------------------------------------------------------------------------------ +// +// class RBBIWordMonkey Word Break specific implementation +// of RBBIMonkeyKind. +// +//------------------------------------------------------------------------------------------ +class RBBIWordMonkey: public RBBIMonkeyKind { +public: + RBBIWordMonkey(); +}; - if (fSepSet->contains(c1)) { - p2 = p1+1; // Separators don't combine with Extend or Format. - setAppliedRule(p2, "SB4 Sep "); - break; - } +RBBIWordMonkey::RBBIWordMonkey() +{ + UErrorCode status = U_ZERO_ERROR; - if (p2 >= fText->length()) { - // Reached end of string. Always a break position. - setAppliedRule(p2, "SB4 Sep "); - break; - } + std::list> partition; - if (p2 == prevPos) { - // Still warming up the loop. 
(won't work with zero length strings, but we don't care) - setAppliedRule(p2, "SB4 Sep "); - continue; - } + dictionarySet_ = UnicodeSet(uR"([[\uac00-\ud7a3][:Han:][:Hiragana:]])", status); + dictionarySet_.addAll(UnicodeSet(uR"([\p{Word_Break = Katakana}])", status)); + dictionarySet_.addAll(UnicodeSet(uR"([\p{LineBreak = Complex_Context}])", status)); - if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { - setAppliedRule(p2, "SB6 ATerm x Numeric"); - continue; - } + // These two could be part of the rules. + rules.push_back(std::make_unique(uR"(WB1 sot ÷ Any)", uR"(^)", u'÷', uR"()")); + // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead. + // The generated rules use the same (?!.). + rules.push_back(std::make_unique(uR"(WB2 Any ÷ eot)", uR"()", u'÷', uR"((?!.))")); - if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && - fATermSet->contains(c1) && fUpperSet->contains(c2)) { - setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper"); - continue; - } + // --- NOLI ME TANGERE --- + // Generated by GenerateBreakTest.java in the Unicode tools. 
+ partition.emplace_back("CR", UnicodeSet(uR"([\p{Word_Break=CR}])", status)); + partition.emplace_back("LF", UnicodeSet(uR"([\p{Word_Break=LF}])", status)); + partition.emplace_back("Newline", UnicodeSet(uR"([\p{Word_Break=Newline}])", status)); + partition.emplace_back("Extend", UnicodeSet(uR"([\p{Word_Break=Extend}])", status)); + partition.emplace_back("Format", UnicodeSet(uR"([[\p{Word_Break=Format}]])", status)); + partition.emplace_back("Katakana", UnicodeSet(uR"([\p{Word_Break=Katakana}])", status)); + partition.emplace_back("ALetter_ExtPict", UnicodeSet(uR"([\p{Word_Break=ALetter}&\p{Extended_Pictographic}])", status)); + partition.emplace_back("ALettermExtPict", UnicodeSet(uR"([\p{Word_Break=ALetter}-\p{Extended_Pictographic}])", status)); + partition.emplace_back("MidLetter", UnicodeSet(uR"([\p{Word_Break=MidLetter}])", status)); + partition.emplace_back("MidNum", UnicodeSet(uR"([\p{Word_Break=MidNum}])", status)); + partition.emplace_back("MidNumLet", UnicodeSet(uR"([\p{Word_Break=MidNumLet}])", status)); + partition.emplace_back("Numeric", UnicodeSet(uR"([\p{Word_Break=Numeric}])", status)); + partition.emplace_back("ExtendNumLet", UnicodeSet(uR"([\p{Word_Break=ExtendNumLet}])", status)); + partition.emplace_back("RI", UnicodeSet(uR"([\p{Word_Break=Regional_Indicator}])", status)); + partition.emplace_back("Hebrew_Letter", UnicodeSet(uR"([\p{Word_Break=Hebrew_Letter}])", status)); + partition.emplace_back("Double_Quote", UnicodeSet(uR"([\p{Word_Break=Double_Quote}])", status)); + partition.emplace_back("Single_Quote", UnicodeSet(uR"([\p{Word_Break=Single_Quote}])", status)); + partition.emplace_back("ZWJ", UnicodeSet(uR"([\p{Word_Break=ZWJ}])", status)); + partition.emplace_back("ExtPictmALetter", UnicodeSet(uR"([\p{Extended_Pictographic}-\p{Word_Break=ALetter}])", status)); + partition.emplace_back("WSegSpace", UnicodeSet(uR"([\p{Word_Break=WSegSpace}])", status)); + partition.emplace_back("XXmExtPict", 
UnicodeSet(uR"([\p{Word_Break=Other}-\p{Extended_Pictographic}])", status)); + + rules.push_back(std::make_unique(uR"($CR × $LF)", uR"(\p{Word_Break=CR})", u'×', uR"(\p{Word_Break=LF})")); + rules.push_back(std::make_unique(uR"(($Newline | $CR | $LF) ÷)", uR"((\p{Word_Break=Newline} | \p{Word_Break=CR} | \p{Word_Break=LF}))", u'÷', uR"()")); + rules.push_back(std::make_unique(uR"(÷ ($Newline | $CR | $LF))", uR"()", u'÷', uR"((\p{Word_Break=Newline} | \p{Word_Break=CR} | \p{Word_Break=LF}))")); + rules.push_back(std::make_unique(uR"($ZWJ × $ExtPict)", uR"(\p{Word_Break=ZWJ})", u'×', uR"(\p{Extended_Pictographic})")); + rules.push_back(std::make_unique(uR"($WSegSpace × $WSegSpace)", uR"(\p{Word_Break=WSegSpace})", u'×', uR"(\p{Word_Break=WSegSpace})")); + rules.push_back(std::make_unique(uR"((?[^$CR $LF $Newline]) ($Extend | $Format | $ZWJ)* → ${X})", uR"((?[^\p{Word_Break=CR} \p{Word_Break=LF} \p{Word_Break=Newline}]) (\p{Word_Break=Extend} | [\p{Word_Break=Format}] | \p{Word_Break=ZWJ})*)", uR"(${X})")); + rules.push_back(std::make_unique(uR"($AHLetter × $AHLetter)", uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])", u'×', uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])")); + rules.push_back(std::make_unique(uR"($AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter)", uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])", u'×', uR"((\p{Word_Break=MidLetter} | [\p{Word_Break=MidNumLet} \p{Word_Break=Single_Quote}]) [\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])")); + rules.push_back(std::make_unique(uR"($AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter)", uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}] (\p{Word_Break=MidLetter} | [\p{Word_Break=MidNumLet} \p{Word_Break=Single_Quote}]))", u'×', uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])")); + rules.push_back(std::make_unique(uR"($Hebrew_Letter × $Single_Quote)", uR"(\p{Word_Break=Hebrew_Letter})", u'×', uR"(\p{Word_Break=Single_Quote})")); + 
rules.push_back(std::make_unique(uR"($Hebrew_Letter × $Double_Quote $Hebrew_Letter)", uR"(\p{Word_Break=Hebrew_Letter})", u'×', uR"(\p{Word_Break=Double_Quote} \p{Word_Break=Hebrew_Letter})")); + rules.push_back(std::make_unique(uR"($Hebrew_Letter $Double_Quote × $Hebrew_Letter)", uR"(\p{Word_Break=Hebrew_Letter} \p{Word_Break=Double_Quote})", u'×', uR"(\p{Word_Break=Hebrew_Letter})")); + rules.push_back(std::make_unique(uR"($Numeric × $Numeric)", uR"(\p{Word_Break=Numeric})", u'×', uR"(\p{Word_Break=Numeric})")); + rules.push_back(std::make_unique(uR"($AHLetter × $Numeric)", uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])", u'×', uR"(\p{Word_Break=Numeric})")); + rules.push_back(std::make_unique(uR"($Numeric × $AHLetter)", uR"(\p{Word_Break=Numeric})", u'×', uR"([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}])")); + rules.push_back(std::make_unique(uR"($Numeric ($MidNum | $MidNumLetQ) × $Numeric)", uR"(\p{Word_Break=Numeric} (\p{Word_Break=MidNum} | [\p{Word_Break=MidNumLet} \p{Word_Break=Single_Quote}]))", u'×', uR"(\p{Word_Break=Numeric})")); + rules.push_back(std::make_unique(uR"($Numeric × ($MidNum | $MidNumLetQ) $Numeric)", uR"(\p{Word_Break=Numeric})", u'×', uR"((\p{Word_Break=MidNum} | [\p{Word_Break=MidNumLet} \p{Word_Break=Single_Quote}]) \p{Word_Break=Numeric})")); + rules.push_back(std::make_unique(uR"($Katakana × $Katakana)", uR"(\p{Word_Break=Katakana})", u'×', uR"(\p{Word_Break=Katakana})")); + rules.push_back(std::make_unique(uR"(($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet)", uR"(([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}] | \p{Word_Break=Numeric} | \p{Word_Break=Katakana} | \p{Word_Break=ExtendNumLet}))", u'×', uR"(\p{Word_Break=ExtendNumLet})")); + rules.push_back(std::make_unique(uR"($ExtendNumLet × ($AHLetter | $Numeric | $Katakana))", uR"(\p{Word_Break=ExtendNumLet})", u'×', uR"(([\p{Word_Break=ALetter} \p{Word_Break=Hebrew_Letter}] | \p{Word_Break=Numeric} | \p{Word_Break=Katakana}))")); + 
rules.push_back(std::make_unique(uR"(^ ($RI $RI)* $RI × $RI)", uR"(^ (\p{Word_Break=Regional_Indicator} \p{Word_Break=Regional_Indicator})* \p{Word_Break=Regional_Indicator})", u'×', uR"(\p{Word_Break=Regional_Indicator})")); + rules.push_back(std::make_unique(uR"([^$RI] ($RI $RI)* $RI × $RI)", uR"([^\p{Word_Break=Regional_Indicator}] (\p{Word_Break=Regional_Indicator} \p{Word_Break=Regional_Indicator})* \p{Word_Break=Regional_Indicator})", u'×', uR"(\p{Word_Break=Regional_Indicator})")); + // --- End of generated code. --- - // Note: STerm | ATerm are added to the negated part of the expression by a - // note to the Unicode 5.0 documents. - int p8 = p1; - while (fSpSet->contains(cAt(p8))) { - p8 = moveBack(p8); - } - while (fCloseSet->contains(cAt(p8))) { - p8 = moveBack(p8); - } - if (fATermSet->contains(cAt(p8))) { - p8=p2; - for (;;) { - c = cAt(p8); - if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || - fLowerSet->contains(c) || fSepSet->contains(c) || - fATermSet->contains(c) || fSTermSet->contains(c)) { + // TODO(egg): This could just as well be part of the rules… + rules.push_back(std::make_unique(uR"(ALL ÷ / ÷ ALL)", uR"()", u'÷', uR"()")); - setAppliedRule(p2, - "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* "); - break; - } - p8 = moveForward(p8); - } - if (fLowerSet->contains(cAt(p8))) { + for (const auto &[name, set] : partition) { + sets.push_back(set); + classNames.push_back(name); + } - setAppliedRule(p2, - "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* "); - continue; - } - } + if (U_FAILURE(status)) { + deferredStatus = status; + } +} - if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { - p8 = p1; - while (fSpSet->contains(cAt(p8))) { - p8 = moveBack(p8); - } - while (fCloseSet->contains(cAt(p8))) { - p8 = moveBack(p8); - } - c = cAt(p8); - if (fSTermSet->contains(c) || fATermSet->contains(c)) { - setAppliedRule(p2, "SB8a (STerm 
| ATerm) Close* Sp* x (SContinue | STerm | ATerm)"); - continue; - } - } +//------------------------------------------------------------------------------------------ +// +// class RBBISentMonkey Sentence Break specific implementation +// of RBBIMonkeyKind. +// +//------------------------------------------------------------------------------------------ +class RBBISentMonkey: public RBBIMonkeyKind { +public: + RBBISentMonkey(); +}; - int p9 = p1; - while (fCloseSet->contains(cAt(p9))) { - p9 = moveBack(p9); - } - c = cAt(p9); - if ((fSTermSet->contains(c) || fATermSet->contains(c))) { - if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { +RBBISentMonkey::RBBISentMonkey() +{ + UErrorCode status = U_ZERO_ERROR; - setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)"); - continue; - } - } + std::list> partition; - int p10 = p1; - while (fSpSet->contains(cAt(p10))) { - p10 = moveBack(p10); - } - while (fCloseSet->contains(cAt(p10))) { - p10 = moveBack(p10); - } - if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { - if (fSpSet->contains(c2) || fSepSet->contains(c2)) { - setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)"); - continue; - } - } + // These two could be part of the rules. + rules.push_back(std::make_unique(uR"(SB1 sot ÷ Any)", uR"(^)", u'÷', uR"()")); + // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead. + // The generated rules use the same (?!.). + rules.push_back(std::make_unique(uR"(SB2 Any ÷ eot)", uR"()", u'÷', uR"((?!.))")); - int p11 = p1; - if (fSepSet->contains(cAt(p11))) { - p11 = moveBack(p11); - } - while (fSpSet->contains(cAt(p11))) { - p11 = moveBack(p11); - } - while (fCloseSet->contains(cAt(p11))) { - p11 = moveBack(p11); - } - if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { - setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? 
"); - break; - } + // --- NOLI ME TANGERE --- + // Generated by GenerateBreakTest.java in the Unicode tools. + partition.emplace_back("CR", UnicodeSet(uR"([\p{Sentence_Break=CR}])", status)); + partition.emplace_back("LF", UnicodeSet(uR"([\p{Sentence_Break=LF}])", status)); + partition.emplace_back("Extend", UnicodeSet(uR"([\p{Sentence_Break=Extend}])", status)); + partition.emplace_back("Format", UnicodeSet(uR"([\p{Sentence_Break=Format}])", status)); + partition.emplace_back("Sep", UnicodeSet(uR"([\p{Sentence_Break=Sep}])", status)); + partition.emplace_back("Sp", UnicodeSet(uR"([\p{Sentence_Break=Sp}])", status)); + partition.emplace_back("Lower", UnicodeSet(uR"([\p{Sentence_Break=Lower}])", status)); + partition.emplace_back("Upper", UnicodeSet(uR"([\p{Sentence_Break=Upper}])", status)); + partition.emplace_back("OLetter", UnicodeSet(uR"([\p{Sentence_Break=OLetter}])", status)); + partition.emplace_back("Numeric", UnicodeSet(uR"([\p{Sentence_Break=Numeric}])", status)); + partition.emplace_back("ATerm", UnicodeSet(uR"([\p{Sentence_Break=ATerm}])", status)); + partition.emplace_back("STerm", UnicodeSet(uR"([\p{Sentence_Break=STerm}])", status)); + partition.emplace_back("Close", UnicodeSet(uR"([\p{Sentence_Break=Close}])", status)); + partition.emplace_back("SContinue", UnicodeSet(uR"([\p{Sentence_Break=SContinue}])", status)); + partition.emplace_back("XX", UnicodeSet(uR"([\p{Sentence_Break=Other}])", status)); + + rules.push_back(std::make_unique(uR"($CR × $LF)", uR"(\p{Sentence_Break=CR})", u'×', uR"(\p{Sentence_Break=LF})")); + rules.push_back(std::make_unique(uR"($ParaSep ÷)", uR"([\p{Sentence_Break=Sep} \p{Sentence_Break=CR} \p{Sentence_Break=LF}])", u'÷', uR"()")); + rules.push_back(std::make_unique(uR"((?[^$ParaSep]) ( $Extend | $Format )* → ${X})", uR"((?[^[\p{Sentence_Break=Sep} \p{Sentence_Break=CR} \p{Sentence_Break=LF}]]) ( \p{Sentence_Break=Extend} | \p{Sentence_Break=Format} )*)", uR"(${X})")); + rules.push_back(std::make_unique(uR"($ATerm × 
$Numeric)", uR"(\p{Sentence_Break=ATerm})", u'×', uR"(\p{Sentence_Break=Numeric})")); + rules.push_back(std::make_unique(uR"(($Upper | $Lower) $ATerm × $Upper)", uR"((\p{Sentence_Break=Upper} | \p{Sentence_Break=Lower}) \p{Sentence_Break=ATerm})", u'×', uR"(\p{Sentence_Break=Upper})")); + rules.push_back(std::make_unique(uR"($ATerm $Close* $Sp* × [^ $OLetter $Upper $Lower $ParaSep $SATerm]* $Lower)", uR"(\p{Sentence_Break=ATerm} \p{Sentence_Break=Close}* \p{Sentence_Break=Sp}*)", u'×', uR"([^ \p{Sentence_Break=OLetter} \p{Sentence_Break=Upper} \p{Sentence_Break=Lower} [\p{Sentence_Break=Sep} \p{Sentence_Break=CR} \p{Sentence_Break=LF}] [\p{Sentence_Break=STerm} \p{Sentence_Break=ATerm}]]* \p{Sentence_Break=Lower})")); + rules.push_back(std::make_unique(uR"($SATerm $Close* $Sp* × ($SContinue | $SATerm))", uR"([\p{Sentence_Break=STerm} \p{Sentence_Break=ATerm}] \p{Sentence_Break=Close}* \p{Sentence_Break=Sp}*)", u'×', uR"((\p{Sentence_Break=SContinue} | [\p{Sentence_Break=STerm} \p{Sentence_Break=ATerm}]))")); + rules.push_back(std::make_unique(uR"($SATerm $Close* × ( $Close | $Sp | $ParaSep ))", uR"([\p{Sentence_Break=STerm} \p{Sentence_Break=ATerm}] \p{Sentence_Break=Close}*)", u'×', uR"(( \p{Sentence_Break=Close} | \p{Sentence_Break=Sp} | [\p{Sentence_Break=Sep} \p{Sentence_Break=CR} \p{Sentence_Break=LF}] ))")); + rules.push_back(std::make_unique(uR"($SATerm $Close* $Sp* × ( $Sp | $ParaSep ))", uR"([\p{Sentence_Break=STerm} \p{Sentence_Break=ATerm}] \p{Sentence_Break=Close}* \p{Sentence_Break=Sp}*)", u'×', uR"(( \p{Sentence_Break=Sp} | [\p{Sentence_Break=Sep} \p{Sentence_Break=CR} \p{Sentence_Break=LF}] ))")); + rules.push_back(std::make_unique(uR"($SATerm $Close* $Sp* $ParaSep? 
÷)", uR"([\p{Sentence_Break=STerm} \p{Sentence_Break=ATerm}] \p{Sentence_Break=Close}* \p{Sentence_Break=Sp}* [\p{Sentence_Break=Sep} \p{Sentence_Break=CR} \p{Sentence_Break=LF}]?)", u'÷', uR"()")); + rules.push_back(std::make_unique(uR"(× $Any)", uR"()", u'×', uR"(.)")); + // --- End of generated code. --- - setAppliedRule(p2, "SB12 Any x Any"); + for (const auto &[name, set] : partition) { + sets.push_back(set); + classNames.push_back(name); } - breakPos = p2; - return breakPos; -} - -RBBISentMonkey::~RBBISentMonkey() { - delete fSepSet; - delete fFormatSet; - delete fSpSet; - delete fLowerSet; - delete fUpperSet; - delete fOLetterSet; - delete fNumericSet; - delete fATermSet; - delete fSContinueSet; - delete fSTermSet; - delete fCloseSet; - delete fOtherSet; - delete fExtendSet; + if (U_FAILURE(status)) { + deferredStatus = status; + } } //------------------------------------------------------------------------------------------- @@ -2873,25 +2206,11 @@ RBBISentMonkey::~RBBISentMonkey() { class RBBILineMonkey: public RBBIMonkeyKind { public: RBBILineMonkey(); - virtual ~RBBILineMonkey(); - virtual const std::vector& charClasses() override; - virtual void setText(const UnicodeString &s) override; - virtual int32_t next(int32_t i) override; private: - std::vector sets; - std::vector> rules; - std::vector resolved; - - BreakIterator *fCharBI; - const UnicodeString *fText; }; RBBILineMonkey::RBBILineMonkey() : - RBBIMonkeyKind(), - - fCharBI(nullptr), - fText(nullptr) - + RBBIMonkeyKind() { if (U_FAILURE(deferredStatus)) { return; @@ -2906,20 +2225,11 @@ RBBILineMonkey::RBBILineMonkey() : std::list> partition; - // TODO(egg): The following two workarounds for what seems to be ICU bugs; - // with UREGEX_DOTALL (but not UREGEX_MULTILINE): - // 1. /.*\u000A/ does not match CR LF; - // 2. /$/ matches ( BK | CR | LF | NL ) eot. 
- rules.push_back(std::make_unique(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()")); - rules.push_back(std::make_unique( - uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)", - uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])", - u'×', - uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)")); - rules.push_back(std::make_unique(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()")); // This one could be part of the rules. - rules.push_back(std::make_unique(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)")); + // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead. + // The generated rules use the same (?!.). + rules.push_back(std::make_unique(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))")); // --- NOLI ME TANGERE --- // Generated by GenerateBreakTest.java in the Unicode tools. @@ -3015,7 +2325,7 @@ RBBILineMonkey::RBBILineMonkey() : rules.push_back(std::make_unique(uR"(× $CP)", uR"()", u'×', uR"(\p{Line_Break=CP})")); rules.push_back(std::make_unique(uR"(× $SY)", uR"()", u'×', uR"(\p{Line_Break=Break_Symbols})")); rules.push_back(std::make_unique(uR"($OP $SP* ×)", uR"(\p{Line_Break=Open_Punctuation} \p{Line_Break=Space}*)", u'×', uR"()")); - rules.push_back(std::make_unique(uR"(( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()")); + rules.push_back(std::make_unique(uR"(( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | ^ ) 
[\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()")); rules.push_back(std::make_unique(uR"(× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ))", uR"()", u'×', uR"([\p{Line_Break=Quotation} && \p{gc=Pf}] ( \p{Line_Break=Space} | \p{Line_Break=Glue} | \p{Line_Break=Word_Joiner} | \p{Line_Break=Close_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=CP} | \p{Line_Break=Exclamation} | \p{Line_Break=Infix_Numeric} | \p{Line_Break=Break_Symbols} | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=ZWSpace} | (?!.) ))")); rules.push_back(std::make_unique(uR"($SP ÷ $IS $NU)", uR"(\p{Line_Break=Space})", u'÷', uR"(\p{Line_Break=Infix_Numeric} \p{Line_Break=Numeric})")); rules.push_back(std::make_unique(uR"(× $IS)", uR"()", u'×', uR"(\p{Line_Break=Infix_Numeric})")); @@ -3027,10 +2337,10 @@ RBBILineMonkey::RBBILineMonkey() : rules.push_back(std::make_unique(uR"([^$EastAsian] × $QU)", uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])", u'×', uR"(\p{Line_Break=Quotation})")); rules.push_back(std::make_unique(uR"(× $QU ( [^$EastAsian] | $eot ))", uR"()", u'×', uR"(\p{Line_Break=Quotation} ( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | (?!.) 
))")); rules.push_back(std::make_unique(uR"($QU × [^$EastAsian])", uR"(\p{Line_Break=Quotation})", u'×', uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])")); - rules.push_back(std::make_unique(uR"(( $sot | [^$EastAsian] ) $QU ×)", uR"(( ^ | [^[\p{ea=F}\p{ea=W}\p{ea=H}]] ) \p{Line_Break=Quotation})", u'×', uR"()")); + rules.push_back(std::make_unique(uR"(( [^$EastAsian] | $sot ) $QU ×)", uR"(( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | ^ ) \p{Line_Break=Quotation})", u'×', uR"()")); rules.push_back(std::make_unique(uR"(÷ $CB)", uR"()", u'÷', uR"(\p{Line_Break=Contingent_Break})")); rules.push_back(std::make_unique(uR"($CB ÷)", uR"(\p{Line_Break=Contingent_Break})", u'÷', uR"()")); - rules.push_back(std::make_unique(uR"(( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])")); + rules.push_back(std::make_unique(uR"(( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} | ^ ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])")); rules.push_back(std::make_unique(uR"(× $BA)", uR"()", u'×', uR"(\p{Line_Break=Break_After})")); rules.push_back(std::make_unique(uR"(× $HY)", uR"()", u'×', uR"(\p{Line_Break=Hyphen})")); 
rules.push_back(std::make_unique(uR"(× $NS)", uR"()", u'×', uR"([\p{Line_Break=Nonstarter} \p{Line_Break=Conditional_Japanese_Starter}])")); @@ -3080,82 +2390,22 @@ RBBILineMonkey::RBBILineMonkey() : // --- End of generated code. --- + // TODO(egg): This could just as well be part of the rules… rules.push_back(std::make_unique(uR"(ALL ÷ / ÷ ALL)", uR"()", u'÷', uR"()")); - const UnicodeSet lbSA(uR"(\p{lb=SA})", status); - for (auto it = partition.begin(); it != partition.end();) { - if (lbSA.containsAll(it->second)) { - it = partition.erase(it); - } else { - ++it; - } - } + dictionarySet_ = UnicodeSet(uR"(\p{lb=SA})", status); for (const auto &[name, set] : partition) { sets.push_back(set); classNames.push_back(name); } - fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); - if (U_FAILURE(status)) { deferredStatus = status; } - -} - -void RBBILineMonkey::setText(const UnicodeString &s) { - fText = &s; - fCharBI->setText(s); - prepareAppliedRules(s.length()); - UnicodeString remapped = s; - resolved.clear(); - resolved.reserve(s.length() + 1); - for (int i = 0; i < s.length() + 1; ++i) { - resolved.emplace_back(i); - } - for (const auto& rule : rules) { - rule->apply(remapped, resolved); - } - for (std::size_t i = 0; i < resolved.size(); ++i) { - if (resolved[i].appliedRule == nullptr) { - printf("Failed to resolve at %zu" , i); - std::terminate(); - } else { - setAppliedRule(i, resolved[i].appliedRule->name().c_str()); - } - } -} - -int32_t RBBILineMonkey::next(int32_t startPos) { - for (std::size_t i = startPos + 1; i < resolved.size(); ++i) { - if (resolved[i].appliedRule != nullptr && - resolved[i].appliedRule->resolution() == SegmentationRule::BREAK) { - return i; - } - } - return -1; -} - - -const std::vector& RBBILineMonkey::charClasses() { - return sets; -} - - -RBBILineMonkey::~RBBILineMonkey() { - constexpr bool debuggingOldMonkeyPerformance = false; - if (debuggingOldMonkeyPerformance) { - for (auto const &rule : rules) { - 
puts((rule->name() + " : " + std::to_string(rule->timeSpent() / std::chrono::milliseconds(1)) + - " ms").c_str()); - } - } - - delete fCharBI; } @@ -3343,6 +2593,16 @@ void RBBITest::TestWordBreaks() UnicodeString ustr = CharsToUnicodeString(strlist[loop]); // RBBICharMonkey monkey; RBBIWordMonkey monkey; + if (monkey.dictionarySet().containsSome(ustr)) { + // Some of these twenty-year-old random examples depend on the + // monkey tests not looking across dictionary/non-dictionary + // boundaries for context when applying the rules. + // The monkeys are not designed to work with dictionary characters, + // so this behaviour is out of scope for testing against the + // monkeys. + logKnownIssue("ICU-22984"); + continue; + } int expected[50]; int expectedcount = 0; @@ -3847,6 +3107,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name errln("%s:%d c < 0", __FILE__, __LINE__); break; } + if (mk.dictionarySet().contains(c)) { + continue; + } if (scalarsOnly && U16_IS_SURROGATE(c)) { continue; } diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 9f85b7917139..e2154abf6309 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -176,7 +176,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_cj.txt b/icu4c/source/test/testdata/break_rules/line_cj.txt index 7aad76ecf107..bb0a6880ea29 100644 --- a/icu4c/source/test/testdata/break_rules/line_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_cj.txt @@ -180,7 +180,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 72e7563c9274..f9152060bf2d 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -181,7 +181,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 99d01874d1fb..b04236532bbd 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -200,7 +200,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA BAX HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index 211298539797..c7c518d5b68b 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -182,7 +182,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index 2061f9170848..cfa9c7968e1b 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -186,7 +186,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 1c7fe9975699..781ce068be7b 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -2214,3 +2214,7 @@ Bangkok)• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. +# A hyphen following non-breaking space that carries an intervening combining +# mark is treated as word-initial; by LB20a it has no break opportunity after +# it. A bug in ICU 76 incorrectly handled that case (ICU-22986). 
+• ̄-k• \ No newline at end of file diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk index 8d0172d055cb..ab4a491c49bd 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk index dbbbc0dfbae1..ca6e43ba4247 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk index 9f77680c2835..31c737abf247 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk index 4199ddeda1cf..0608fb2f4de9 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk index bebfe7285a2f..bba43a75fc6c 100644 Binary files 
a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk index 0229e2cb2f2d..371a487782a9 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk index 9b13706bfb58..47f829ce0340 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk index 7cbc69987714..49fa8e0416b3 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk index b9f1fa48e7d7..3920cf5e5dba 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk and 
b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk differ diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 56a4801bea29..5b3489e4369e 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -19,10 +19,12 @@ import org.junit.runners.JUnit4; // Monkey testing of RuleBasedBreakIterator. -// The old, original monkey test. TODO: remove +// The old monkey test, now using regexes generated by the Unicode tools. // The new monkey test is class RBBIMonkeyTest. import com.ibm.icu.dev.test.CoreTestFmwk; +import com.ibm.icu.dev.test.rbbi.SegmentationRule.BreakContext; +import com.ibm.icu.dev.test.rbbi.SegmentationRule.Resolution; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.BreakIterator; @@ -676,166 +678,15 @@ int next(int prevPos) { static class RBBILineMonkey extends RBBIMonkeyKind { - // UnicodeSets for each of the Line Breaking character classes. - // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier - // to verify that they are all accounted for. - - // XUnicodeSet is like UnicodeSet, except that the method contains(int codePoint) does not - // throw exceptions on out-of-range codePoints. This matches ICU4C behavior. - // The LineMonkey test (ported from ICU4C) relies on this behavior, it uses a value of -1 - // to represent a non-codepoint that is not included in any of the property sets. - // This happens for rule 30a. - class XUnicodeSet extends UnicodeSet { - XUnicodeSet(String pattern) { super(pattern); } - XUnicodeSet() { super(); } - @Override - public boolean contains(int codePoint) { - return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ? 
- false : super.contains(codePoint); - } - } - - // Declare these variables as XUnicodeSet, not merely as UnicodeSet, - // so that when we copy a new declaration from C++ (where only UnicodeSet exists), - // the missing 'X' prefix is visible; - // and when the prefix is there and we copy a new initializer we get a compiler error. - // (Otherwise we rely on the caller catching the IAE from using codePoint=-1 - // and failing with a message that tells us what to do.) - XUnicodeSet fBK; - XUnicodeSet fCR; - XUnicodeSet fLF; - XUnicodeSet fCM; - XUnicodeSet fNL; - XUnicodeSet fSG; - XUnicodeSet fWJ; - XUnicodeSet fZW; - XUnicodeSet fGL; - XUnicodeSet fSP; - XUnicodeSet fB2; - XUnicodeSet fBA; - XUnicodeSet fBB; - XUnicodeSet fHH; - XUnicodeSet fHY; - XUnicodeSet fCB; - XUnicodeSet fCL; - XUnicodeSet fCP; - XUnicodeSet fEX; - XUnicodeSet fIN; - XUnicodeSet fNS; - XUnicodeSet fOP; - XUnicodeSet fQU; - XUnicodeSet fIS; - XUnicodeSet fNU; - XUnicodeSet fPO; - XUnicodeSet fPR; - XUnicodeSet fSY; - XUnicodeSet fAI; - XUnicodeSet fAL; - XUnicodeSet fCJ; - XUnicodeSet fH2; - XUnicodeSet fH3; - XUnicodeSet fHL; - XUnicodeSet fID; - XUnicodeSet fJL; - XUnicodeSet fJV; - XUnicodeSet fJT; - XUnicodeSet fRI; - XUnicodeSet fXX; - XUnicodeSet fEB; - XUnicodeSet fEM; - XUnicodeSet fZWJ; - XUnicodeSet fOP30; - XUnicodeSet fCP30; - XUnicodeSet fExtPictUnassigned; - XUnicodeSet fAK; - XUnicodeSet fAP; - XUnicodeSet fAS; - XUnicodeSet fVF; - XUnicodeSet fVI; - XUnicodeSet fPi; - XUnicodeSet fPf; - XUnicodeSet feaFWH; + List rules; + SegmentationRule.BreakContext[] resolved; StringBuffer fText; - int fOrigPositions; RBBILineMonkey() { fCharProperty = UProperty.LINE_BREAK; - fBK = new XUnicodeSet("[\\p{Line_Break=BK}]"); - fCR = new XUnicodeSet("[\\p{Line_break=CR}]"); - fLF = new XUnicodeSet("[\\p{Line_break=LF}]"); - fCM = new XUnicodeSet("[\\p{Line_break=CM}]"); - fNL = new XUnicodeSet("[\\p{Line_break=NL}]"); - fSG = new XUnicodeSet("[\\ud800-\\udfff]"); - fWJ = new 
XUnicodeSet("[\\p{Line_break=WJ}]"); - fZW = new XUnicodeSet("[\\p{Line_break=ZW}]"); - fGL = new XUnicodeSet("[\\p{Line_break=GL}]"); - fSP = new XUnicodeSet("[\\p{Line_break=SP}]"); - fB2 = new XUnicodeSet("[\\p{Line_break=B2}]"); - fBA = new XUnicodeSet("[\\p{Line_break=BA}]"); - fBB = new XUnicodeSet("[\\p{Line_break=BB}]"); - fHH = new XUnicodeSet(); - fHY = new XUnicodeSet("[\\p{Line_break=HY}]"); - fCB = new XUnicodeSet("[\\p{Line_break=CB}]"); - fCL = new XUnicodeSet("[\\p{Line_break=CL}]"); - fCP = new XUnicodeSet("[\\p{Line_break=CP}]"); - fEX = new XUnicodeSet("[\\p{Line_break=EX}]"); - fIN = new XUnicodeSet("[\\p{Line_break=IN}]"); - fNS = new XUnicodeSet("[\\p{Line_break=NS}]"); - fOP = new XUnicodeSet("[\\p{Line_break=OP}]"); - fQU = new XUnicodeSet("[\\p{Line_break=QU}]"); - fIS = new XUnicodeSet("[\\p{Line_break=IS}]"); - fNU = new XUnicodeSet("[\\p{Line_break=NU}]"); - fPO = new XUnicodeSet("[\\p{Line_break=PO}]"); - fPR = new XUnicodeSet("[\\p{Line_break=PR}]"); - fSY = new XUnicodeSet("[\\p{Line_break=SY}]"); - fAI = new XUnicodeSet("[\\p{Line_break=AI}]"); - fAL = new XUnicodeSet("[\\p{Line_break=AL}]"); - fCJ = new XUnicodeSet("[\\p{Line_break=CJ}]"); - fH2 = new XUnicodeSet("[\\p{Line_break=H2}]"); - fH3 = new XUnicodeSet("[\\p{Line_break=H3}]"); - fHL = new XUnicodeSet("[\\p{Line_break=HL}]"); - fID = new XUnicodeSet("[\\p{Line_break=ID}]"); - fJL = new XUnicodeSet("[\\p{Line_break=JL}]"); - fJV = new XUnicodeSet("[\\p{Line_break=JV}]"); - fJT = new XUnicodeSet("[\\p{Line_break=JT}]"); - fRI = new XUnicodeSet("[\\p{Line_break=RI}]"); - fXX = new XUnicodeSet("[\\p{Line_break=XX}]"); - fEB = new XUnicodeSet("[\\p{Line_break=EB}]"); - fEM = new XUnicodeSet("[\\p{Line_break=EM}]"); - fZWJ = new XUnicodeSet("[\\p{Line_break=ZWJ}]"); - fOP30 = new XUnicodeSet("[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"); - fCP30 = new XUnicodeSet("[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"); - fExtPictUnassigned = new 
XUnicodeSet("[\\p{Extended_Pictographic}&\\p{Cn}]"); - fAK = new XUnicodeSet("[\\p{Line_Break=AK}]"); - fAP = new XUnicodeSet("[\\p{Line_Break=AP}]"); - fAS = new XUnicodeSet("[\\p{Line_Break=AS}]"); - fVF = new XUnicodeSet("[\\p{Line_Break=VF}]"); - fVI = new XUnicodeSet("[\\p{Line_Break=VI}]"); - - fPi = new XUnicodeSet("[\\p{Pi}]"); - fPf = new XUnicodeSet("[\\p{Pf}]"); - - feaFWH = new XUnicodeSet("[\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); - - // Remove dictionary characters. - // The monkey test reference implementation of line break does not replicate the dictionary behavior, - // so dictionary characters are omitted from the monkey test data. - @SuppressWarnings("unused") - UnicodeSet dictionarySet = new UnicodeSet( - "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); - - fAL.addAll(fXX); // Default behavior for XX is identical to AL - fAL.addAll(fAI); // Default behavior for AI is identical to AL - fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL - - fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. - fCM.addAll(fZWJ); // ZWJ behaves as a CM. 
- - fHH.add('\u2010'); // Hyphen, '‐' - class NamedSet { String name; UnicodeSet set; @@ -847,40 +698,181 @@ class NamedSet { this(name, new UnicodeSet(pattern)); } }; - - final List interestingSets = new ArrayList<>(); - interestingSets.add(new NamedSet("eastAsian", "[\\p{ea=F}\\p{ea=W}\\p{ea=H}]")); - interestingSets.add(new NamedSet("Pi", "\\p{Pi}")); - interestingSets.add(new NamedSet("Pf", "\\p{Pf}")); - interestingSets.add(new NamedSet("DOTTEDC.", "[◌]")); - interestingSets.add(new NamedSet("HYPHEN", "[\\u2010]")); - interestingSets.add(new NamedSet("ExtPictCn", "[\\p{Extended_Pictographic}&\\p{Cn}]")); - final List partition = new ArrayList<>(); - for (int lb = 0; lb < UCharacter.LineBreak.COUNT; ++lb) { - final String lbValueShortName = - UCharacter.getPropertyValueName(UProperty.LINE_BREAK, lb, UProperty.NameChoice.SHORT); - if (lbValueShortName.equals("SA")) { + List partition = new ArrayList<>(); + rules = new ArrayList<>(); + + rules.add(new RegexRule("sot ÷ contra LB2", "^", Resolution.BREAK, "")); + // This one could be part of the rules. + // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead. + // The generated rules use the same (?!.). + rules.add(new RegexRule("LB3 ÷ eot", "", Resolution.BREAK, "(?!.)")); + + // --- NOLI ME TANGERE --- + // Generated by GenerateBreakTest.java in the Unicode tools. 
+ partition.add(new NamedSet("AI_EastAsian", new UnicodeSet("[\\p{Line_Break=Ambiguous}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("AImEastAsian", new UnicodeSet("[\\p{Line_Break=Ambiguous}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("AK", new UnicodeSet("[\\p{Line_Break=Aksara}]"))); + partition.add(new NamedSet("ALorig_EastAsian", new UnicodeSet("[\\p{Line_Break=Alphabetic}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("ALorig_DottedCircle", new UnicodeSet("[\\p{Line_Break=Alphabetic}&[◌]]"))); + partition.add(new NamedSet("ALorigmEastAsianmDottedCircle", new UnicodeSet("[\\p{Line_Break=Alphabetic}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]-[◌]]"))); + partition.add(new NamedSet("AP", new UnicodeSet("[\\p{Line_Break=Aksara_Prebase}]"))); + partition.add(new NamedSet("AS", new UnicodeSet("[\\p{Line_Break=Aksara_Start}]"))); + partition.add(new NamedSet("B2", new UnicodeSet("[\\p{Line_Break=Break_Both}]"))); + partition.add(new NamedSet("BA_EastAsian", new UnicodeSet("[\\p{Line_Break=Break_After}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("BA_Hyphen", new UnicodeSet("[\\p{Line_Break=Break_After}&[\\u2010]]"))); + partition.add(new NamedSet("BAmEastAsianmHyphen", new UnicodeSet("[\\p{Line_Break=Break_After}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]-[\\u2010]]"))); + partition.add(new NamedSet("BB", new UnicodeSet("[\\p{Line_Break=Break_Before}]"))); + partition.add(new NamedSet("BK", new UnicodeSet("[\\p{Line_Break=Mandatory_Break}]"))); + partition.add(new NamedSet("CB", new UnicodeSet("[\\p{Line_Break=Contingent_Break}]"))); + partition.add(new NamedSet("CL_EastAsian", new UnicodeSet("[\\p{Line_Break=Close_Punctuation}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("CLmEastAsian", new UnicodeSet("[\\p{Line_Break=Close_Punctuation}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("CP", new UnicodeSet("[\\p{Line_Break=CP}]"))); + partition.add(new 
NamedSet("CMorig_EastAsian", new UnicodeSet("[\\p{Line_Break=Combining_Mark}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("CMorigmEastAsian", new UnicodeSet("[\\p{Line_Break=Combining_Mark}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("CR", new UnicodeSet("[\\p{Line_Break=Carriage_Return}]"))); + partition.add(new NamedSet("EX_EastAsian", new UnicodeSet("[\\p{Line_Break=Exclamation}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("EXmEastAsian", new UnicodeSet("[\\p{Line_Break=Exclamation}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("GL_EastAsian", new UnicodeSet("[\\p{Line_Break=Glue}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("GLmEastAsian", new UnicodeSet("[\\p{Line_Break=Glue}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("H2", new UnicodeSet("[\\p{Line_Break=H2}]"))); + partition.add(new NamedSet("H3", new UnicodeSet("[\\p{Line_Break=H3}]"))); + partition.add(new NamedSet("HL", new UnicodeSet("[\\p{Line_Break=HL}]"))); + partition.add(new NamedSet("HY", new UnicodeSet("[\\p{Line_Break=Hyphen}]"))); + partition.add(new NamedSet("ID_EastAsian", new UnicodeSet("[\\p{Line_Break=Ideographic}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("ID_ExtPictUnassigned", new UnicodeSet("[\\p{Line_Break=Ideographic}&[\\p{Extended_Pictographic}&\\p{gc=Cn}]]"))); + partition.add(new NamedSet("IDmEastAsianmExtPictUnassigned", new UnicodeSet("[\\p{Line_Break=Ideographic}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]-[\\p{Extended_Pictographic}&\\p{gc=Cn}]]"))); + partition.add(new NamedSet("IN_EastAsian", new UnicodeSet("[\\p{Line_Break=Inseparable}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("INmEastAsian", new UnicodeSet("[\\p{Line_Break=Inseparable}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("IS", new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]"))); + partition.add(new NamedSet("JL", new 
UnicodeSet("[\\p{Line_Break=JL}]"))); + partition.add(new NamedSet("JT", new UnicodeSet("[\\p{Line_Break=JT}]"))); + partition.add(new NamedSet("JV", new UnicodeSet("[\\p{Line_Break=JV}]"))); + partition.add(new NamedSet("LF", new UnicodeSet("[\\p{Line_Break=Line_Feed}]"))); + partition.add(new NamedSet("NL", new UnicodeSet("[\\p{Line_Break=Next_Line}]"))); + partition.add(new NamedSet("NSorig_EastAsian", new UnicodeSet("[\\p{Line_Break=Nonstarter}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("NSorigmEastAsian", new UnicodeSet("[\\p{Line_Break=Nonstarter}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("NU", new UnicodeSet("[\\p{Line_Break=Numeric}]"))); + partition.add(new NamedSet("OP_EastAsian", new UnicodeSet("[\\p{Line_Break=Open_Punctuation}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("OPmEastAsian", new UnicodeSet("[\\p{Line_Break=Open_Punctuation}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("PO_EastAsian", new UnicodeSet("[\\p{Line_Break=Postfix_Numeric}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("POmEastAsian", new UnicodeSet("[\\p{Line_Break=Postfix_Numeric}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("PR_EastAsian", new UnicodeSet("[\\p{Line_Break=Prefix_Numeric}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("PRmEastAsian", new UnicodeSet("[\\p{Line_Break=Prefix_Numeric}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("QU_Pi", new UnicodeSet("[\\p{Line_Break=Quotation}&\\p{gc=Pi}]"))); + partition.add(new NamedSet("QU_Pf", new UnicodeSet("[\\p{Line_Break=Quotation}&\\p{gc=Pf}]"))); + partition.add(new NamedSet("QUmPimPf", new UnicodeSet("[\\p{Line_Break=Quotation}-\\p{gc=Pi}-\\p{gc=Pf}]"))); + partition.add(new NamedSet("SA_Mn", new UnicodeSet("[[\\p{Line_Break=Complex_Context}&\\p{gc=Mn}]]"))); + partition.add(new NamedSet("SA_Mc", new UnicodeSet("[[\\p{Line_Break=Complex_Context}&\\p{gc=Mc}]]"))); 
+ partition.add(new NamedSet("SAmMnmMc", new UnicodeSet("[[\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]]"))); + partition.add(new NamedSet("SG", new UnicodeSet("[\\p{Line_Break=Surrogate}]"))); + partition.add(new NamedSet("SP", new UnicodeSet("[\\p{Line_Break=Space}]"))); + partition.add(new NamedSet("SY", new UnicodeSet("[\\p{Line_Break=Break_Symbols}]"))); + partition.add(new NamedSet("VF", new UnicodeSet("[\\p{Line_Break=Virama_Final}]"))); + partition.add(new NamedSet("VI", new UnicodeSet("[\\p{Line_Break=Virama}]"))); + partition.add(new NamedSet("WJ", new UnicodeSet("[\\p{Line_Break=Word_Joiner}]"))); + partition.add(new NamedSet("XX_ExtPictUnassigned", new UnicodeSet("[\\p{Line_Break=Unknown}&[\\p{Extended_Pictographic}&\\p{gc=Cn}]]"))); + partition.add(new NamedSet("XXmExtPictUnassigned", new UnicodeSet("[\\p{Line_Break=Unknown}-[\\p{Extended_Pictographic}&\\p{gc=Cn}]]"))); + partition.add(new NamedSet("ZW", new UnicodeSet("[\\p{Line_Break=ZWSpace}]"))); + partition.add(new NamedSet("CJ", new UnicodeSet("[\\p{Line_Break=Conditional_Japanese_Starter}]"))); + partition.add(new NamedSet("RI", new UnicodeSet("[\\p{Line_Break=Regional_Indicator}]"))); + partition.add(new NamedSet("EB_EastAsian", new UnicodeSet("[\\p{Line_Break=E_Base}&[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("EBmEastAsian", new UnicodeSet("[\\p{Line_Break=E_Base}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]"))); + partition.add(new NamedSet("EM", new UnicodeSet("[\\p{Line_Break=E_Modifier}]"))); + partition.add(new NamedSet("ZWJ", new UnicodeSet("[\\p{Line_Break=ZWJ}]"))); + + rules.add(new RegexRule("$BK ÷", "\\p{Line_Break=Mandatory_Break}", Resolution.BREAK, "")); + rules.add(new RegexRule("$CR × $LF", "\\p{Line_Break=Carriage_Return}", Resolution.NO_BREAK, "\\p{Line_Break=Line_Feed}")); + rules.add(new RegexRule("$CR ÷", "\\p{Line_Break=Carriage_Return}", Resolution.BREAK, "")); + rules.add(new RegexRule("$LF ÷", "\\p{Line_Break=Line_Feed}", Resolution.BREAK, "")); + 
rules.add(new RegexRule("$NL ÷", "\\p{Line_Break=Next_Line}", Resolution.BREAK, "")); + rules.add(new RegexRule("× ( $BK | $CR | $LF | $NL )", "", Resolution.NO_BREAK, "( \\p{Line_Break=Mandatory_Break} | \\p{Line_Break=Carriage_Return} | \\p{Line_Break=Line_Feed} | \\p{Line_Break=Next_Line} )")); + rules.add(new RegexRule("× $SP", "", Resolution.NO_BREAK, "\\p{Line_Break=Space}")); + rules.add(new RegexRule("× $ZW", "", Resolution.NO_BREAK, "\\p{Line_Break=ZWSpace}")); + rules.add(new RegexRule("$ZW $SP* ÷", "\\p{Line_Break=ZWSpace} \\p{Line_Break=Space}*", Resolution.BREAK, "")); + rules.add(new RegexRule("$ZWJ ×", "\\p{Line_Break=ZWJ}", Resolution.NO_BREAK, "")); + rules.add(new RemapRule("(?<X>[^$BK $CR $LF $NL $SP $ZW]) ( $CM | $ZWJ )* → ${X}", "(?<X>[^\\p{Line_Break=Mandatory_Break} \\p{Line_Break=Carriage_Return} \\p{Line_Break=Line_Feed} \\p{Line_Break=Next_Line} \\p{Line_Break=Space} \\p{Line_Break=ZWSpace}]) ( [\\p{Line_Break=Combining_Mark} [\\p{Line_Break=Complex_Context}&\\p{gc=Mn}] [\\p{Line_Break=Complex_Context}&\\p{gc=Mc}]] | \\p{Line_Break=ZWJ} )*", "${X}")); + rules.add(new RemapRule("( $CM | $ZWJ ) → A", "( [\\p{Line_Break=Combining_Mark} [\\p{Line_Break=Complex_Context}&\\p{gc=Mn}] [\\p{Line_Break=Complex_Context}&\\p{gc=Mc}]] | \\p{Line_Break=ZWJ} )", "A")); + rules.add(new RegexRule("× $WJ", "", Resolution.NO_BREAK, "\\p{Line_Break=Word_Joiner}")); + rules.add(new RegexRule("$WJ ×", "\\p{Line_Break=Word_Joiner}", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("$GL ×", "\\p{Line_Break=Glue}", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("[^ $SP $BA $HY] × $GL", "[^ \\p{Line_Break=Space} \\p{Line_Break=Break_After} \\p{Line_Break=Hyphen}]", Resolution.NO_BREAK, "\\p{Line_Break=Glue}")); + rules.add(new RegexRule("× $EX", "", Resolution.NO_BREAK, "\\p{Line_Break=Exclamation}")); + rules.add(new RegexRule("× $CL", "", Resolution.NO_BREAK, "\\p{Line_Break=Close_Punctuation}")); + rules.add(new RegexRule("× $CP", "", Resolution.NO_BREAK, 
"\\p{Line_Break=CP}")); + rules.add(new RegexRule("× $SY", "", Resolution.NO_BREAK, "\\p{Line_Break=Break_Symbols}")); + rules.add(new RegexRule("$OP $SP* ×", "\\p{Line_Break=Open_Punctuation} \\p{Line_Break=Space}*", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×", "( \\p{Line_Break=Mandatory_Break} | \\p{Line_Break=Carriage_Return} | \\p{Line_Break=Line_Feed} | \\p{Line_Break=Next_Line} | \\p{Line_Break=Open_Punctuation} | \\p{Line_Break=Quotation} | \\p{Line_Break=Glue} | \\p{Line_Break=Space} | \\p{Line_Break=ZWSpace} | ^ ) [\\p{Line_Break=Quotation} & \\p{gc=Pi}] \\p{Line_Break=Space}*", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot )", "", Resolution.NO_BREAK, "[\\p{Line_Break=Quotation} & \\p{gc=Pf}] ( \\p{Line_Break=Space} | \\p{Line_Break=Glue} | \\p{Line_Break=Word_Joiner} | \\p{Line_Break=Close_Punctuation} | \\p{Line_Break=Quotation} | \\p{Line_Break=CP} | \\p{Line_Break=Exclamation} | \\p{Line_Break=Infix_Numeric} | \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Mandatory_Break} | \\p{Line_Break=Carriage_Return} | \\p{Line_Break=Line_Feed} | \\p{Line_Break=Next_Line} | \\p{Line_Break=ZWSpace} | (?!.) 
)")); + rules.add(new RegexRule("$SP ÷ $IS $NU", "\\p{Line_Break=Space}", Resolution.BREAK, "\\p{Line_Break=Infix_Numeric} \\p{Line_Break=Numeric}")); + rules.add(new RegexRule("× $IS", "", Resolution.NO_BREAK, "\\p{Line_Break=Infix_Numeric}")); + rules.add(new RegexRule("($CL | $CP) $SP* × $NS", "(\\p{Line_Break=Close_Punctuation} | \\p{Line_Break=CP}) \\p{Line_Break=Space}*", Resolution.NO_BREAK, "[\\p{Line_Break=Nonstarter} \\p{Line_Break=Conditional_Japanese_Starter}]")); + rules.add(new RegexRule("$B2 $SP* × $B2", "\\p{Line_Break=Break_Both} \\p{Line_Break=Space}*", Resolution.NO_BREAK, "\\p{Line_Break=Break_Both}")); + rules.add(new RegexRule("$SP ÷", "\\p{Line_Break=Space}", Resolution.BREAK, "")); + rules.add(new RegexRule("× $QUmPi", "", Resolution.NO_BREAK, "[\\p{Line_Break=Quotation} - \\p{gc=Pi}]")); + rules.add(new RegexRule("$QUmPf ×", "[\\p{Line_Break=Quotation} - \\p{gc=Pf}]", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("[^$EastAsian] × $QU", "[^[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", Resolution.NO_BREAK, "\\p{Line_Break=Quotation}")); + rules.add(new RegexRule("× $QU ( [^$EastAsian] | $eot )", "", Resolution.NO_BREAK, "\\p{Line_Break=Quotation} ( [^[\\p{ea=F}\\p{ea=W}\\p{ea=H}]] | (?!.) 
)")); + rules.add(new RegexRule("$QU × [^$EastAsian]", "\\p{Line_Break=Quotation}", Resolution.NO_BREAK, "[^[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]")); + rules.add(new RegexRule("( [^$EastAsian] | $sot ) $QU ×", "( [^[\\p{ea=F}\\p{ea=W}\\p{ea=H}]] | ^ ) \\p{Line_Break=Quotation}", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("÷ $CB", "", Resolution.BREAK, "\\p{Line_Break=Contingent_Break}")); + rules.add(new RegexRule("$CB ÷", "\\p{Line_Break=Contingent_Break}", Resolution.BREAK, "")); + rules.add(new RegexRule("( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL", "( \\p{Line_Break=Mandatory_Break} | \\p{Line_Break=Carriage_Return} | \\p{Line_Break=Line_Feed} | \\p{Line_Break=Next_Line} | \\p{Line_Break=Space} | \\p{Line_Break=ZWSpace} | \\p{Line_Break=Contingent_Break} | \\p{Line_Break=Glue} | ^ ) ( \\p{Line_Break=Hyphen} | [\\u2010] )", Resolution.NO_BREAK, "[\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]]")); + rules.add(new RegexRule("× $BA", "", Resolution.NO_BREAK, "\\p{Line_Break=Break_After}")); + rules.add(new RegexRule("× $HY", "", Resolution.NO_BREAK, "\\p{Line_Break=Hyphen}")); + rules.add(new RegexRule("× $NS", "", Resolution.NO_BREAK, "[\\p{Line_Break=Nonstarter} \\p{Line_Break=Conditional_Japanese_Starter}]")); + rules.add(new RegexRule("$BB ×", "\\p{Line_Break=Break_Before}", Resolution.NO_BREAK, "")); + rules.add(new RegexRule("$HL ($HY | $NonEastAsianBA) × [^$HL]", "\\p{Line_Break=HL} (\\p{Line_Break=Hyphen} | [\\p{Line_Break=Break_After} & [^[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]])", Resolution.NO_BREAK, "[^\\p{Line_Break=HL}]")); + rules.add(new RegexRule("$SY × $HL", "\\p{Line_Break=Break_Symbols}", Resolution.NO_BREAK, "\\p{Line_Break=HL}")); + rules.add(new RegexRule("× $IN", "", Resolution.NO_BREAK, "\\p{Line_Break=Inseparable}")); + rules.add(new RegexRule("($AL | $HL) × $NU", "([\\p{Line_Break=Ambiguous} 
\\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})", Resolution.NO_BREAK, "\\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$NU × ($AL | $HL)", "\\p{Line_Break=Numeric}", Resolution.NO_BREAK, "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})")); + rules.add(new RegexRule("$PR × ($ID | $EB | $EM)", "\\p{Line_Break=Prefix_Numeric}", Resolution.NO_BREAK, "(\\p{Line_Break=Ideographic} | \\p{Line_Break=E_Base} | \\p{Line_Break=E_Modifier})")); + rules.add(new RegexRule("($ID | $EB | $EM) × $PO", "(\\p{Line_Break=Ideographic} | \\p{Line_Break=E_Base} | \\p{Line_Break=E_Modifier})", Resolution.NO_BREAK, "\\p{Line_Break=Postfix_Numeric}")); + rules.add(new RegexRule("($PR | $PO) × ($AL | $HL)", "(\\p{Line_Break=Prefix_Numeric} | \\p{Line_Break=Postfix_Numeric})", Resolution.NO_BREAK, "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})")); + rules.add(new RegexRule("($AL | $HL) × ($PR | $PO)", "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})", Resolution.NO_BREAK, "(\\p{Line_Break=Prefix_Numeric} | \\p{Line_Break=Postfix_Numeric})")); + rules.add(new RegexRule("$NU ( $SY | $IS )* $CL × $PO", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )* \\p{Line_Break=Close_Punctuation}", Resolution.NO_BREAK, "\\p{Line_Break=Postfix_Numeric}")); + rules.add(new RegexRule("$NU ( $SY | $IS )* $CP × $PO", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )* \\p{Line_Break=CP}", Resolution.NO_BREAK, 
"\\p{Line_Break=Postfix_Numeric}")); + rules.add(new RegexRule("$NU ( $SY | $IS )* $CL × $PR", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )* \\p{Line_Break=Close_Punctuation}", Resolution.NO_BREAK, "\\p{Line_Break=Prefix_Numeric}")); + rules.add(new RegexRule("$NU ( $SY | $IS )* $CP × $PR", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )* \\p{Line_Break=CP}", Resolution.NO_BREAK, "\\p{Line_Break=Prefix_Numeric}")); + rules.add(new RegexRule("$NU ( $SY | $IS )* × $PO", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )*", Resolution.NO_BREAK, "\\p{Line_Break=Postfix_Numeric}")); + rules.add(new RegexRule("$NU ( $SY | $IS )* × $PR", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )*", Resolution.NO_BREAK, "\\p{Line_Break=Prefix_Numeric}")); + rules.add(new RegexRule("$PO × $OP $NU", "\\p{Line_Break=Postfix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Open_Punctuation} \\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$PO × $OP $IS $NU", "\\p{Line_Break=Postfix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Open_Punctuation} \\p{Line_Break=Infix_Numeric} \\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$PO × $NU", "\\p{Line_Break=Postfix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$PR × $OP $NU", "\\p{Line_Break=Prefix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Open_Punctuation} \\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$PR × $OP $IS $NU", "\\p{Line_Break=Prefix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Open_Punctuation} \\p{Line_Break=Infix_Numeric} \\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$PR × $NU", "\\p{Line_Break=Prefix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$HY × $NU", "\\p{Line_Break=Hyphen}", Resolution.NO_BREAK, "\\p{Line_Break=Numeric}")); + 
rules.add(new RegexRule("$IS × $NU", "\\p{Line_Break=Infix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$NU ( $SY | $IS )* × $NU", "\\p{Line_Break=Numeric} ( \\p{Line_Break=Break_Symbols} | \\p{Line_Break=Infix_Numeric} )*", Resolution.NO_BREAK, "\\p{Line_Break=Numeric}")); + rules.add(new RegexRule("$JL × $JL | $JV | $H2 | $H3", "\\p{Line_Break=JL}", Resolution.NO_BREAK, "\\p{Line_Break=JL} | \\p{Line_Break=JV} | \\p{Line_Break=H2} | \\p{Line_Break=H3}")); + rules.add(new RegexRule("$JV | $H2 × $JV | $JT", "\\p{Line_Break=JV} | \\p{Line_Break=H2}", Resolution.NO_BREAK, "\\p{Line_Break=JV} | \\p{Line_Break=JT}")); + rules.add(new RegexRule("$JT | $H3 × $JT", "\\p{Line_Break=JT} | \\p{Line_Break=H3}", Resolution.NO_BREAK, "\\p{Line_Break=JT}")); + rules.add(new RegexRule("$JL | $JV | $JT | $H2 | $H3 × $PO", "\\p{Line_Break=JL} | \\p{Line_Break=JV} | \\p{Line_Break=JT} | \\p{Line_Break=H2} | \\p{Line_Break=H3}", Resolution.NO_BREAK, "\\p{Line_Break=Postfix_Numeric}")); + rules.add(new RegexRule("$PR × $JL | $JV | $JT | $H2 | $H3", "\\p{Line_Break=Prefix_Numeric}", Resolution.NO_BREAK, "\\p{Line_Break=JL} | \\p{Line_Break=JV} | \\p{Line_Break=JT} | \\p{Line_Break=H2} | \\p{Line_Break=H3}")); + rules.add(new RegexRule("($AL | $HL) × ($AL | $HL)", "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})", Resolution.NO_BREAK, "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})")); + rules.add(new RegexRule("$AP × ($AK | $DottedCircle | $AS)", "\\p{Line_Break=Aksara_Prebase}", Resolution.NO_BREAK, "(\\p{Line_Break=Aksara} | [◌] | \\p{Line_Break=Aksara_Start})")); + rules.add(new RegexRule("($AK | $DottedCircle | $AS) × ($VF | $VI)", "(\\p{Line_Break=Aksara} | [◌] | 
\\p{Line_Break=Aksara_Start})", Resolution.NO_BREAK, "(\\p{Line_Break=Virama_Final} | \\p{Line_Break=Virama})")); + rules.add(new RegexRule("($AK | $DottedCircle | $AS) $VI × ($AK | $DottedCircle)", "(\\p{Line_Break=Aksara} | [◌] | \\p{Line_Break=Aksara_Start}) \\p{Line_Break=Virama}", Resolution.NO_BREAK, "(\\p{Line_Break=Aksara} | [◌])")); + rules.add(new RegexRule("($AK | $DottedCircle | $AS) × ($AK | $DottedCircle | $AS) $VF", "(\\p{Line_Break=Aksara} | [◌] | \\p{Line_Break=Aksara_Start})", Resolution.NO_BREAK, "(\\p{Line_Break=Aksara} | [◌] | \\p{Line_Break=Aksara_Start}) \\p{Line_Break=Virama_Final}")); + rules.add(new RegexRule("$IS × ($AL | $HL)", "\\p{Line_Break=Infix_Numeric}", Resolution.NO_BREAK, "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL})")); + rules.add(new RegexRule("($AL | $HL | $NU) × $OPmEastAsian", "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL} | \\p{Line_Break=Numeric})", Resolution.NO_BREAK, "[\\p{Line_Break=Open_Punctuation}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]")); + rules.add(new RegexRule("$CPmEastAsian × ($AL | $HL | $NU)", "[\\p{Line_Break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", Resolution.NO_BREAK, "([\\p{Line_Break=Ambiguous} \\p{Line_Break=Alphabetic} \\p{Line_Break=Surrogate} \\p{Line_Break=Unknown} [\\p{Line_Break=Complex_Context}-\\p{gc=Mn}-\\p{gc=Mc}]] | \\p{Line_Break=HL} | \\p{Line_Break=Numeric})")); + rules.add(new RegexRule("$sot ($RI $RI)* $RI × $RI", "^ (\\p{Line_Break=Regional_Indicator} \\p{Line_Break=Regional_Indicator})* \\p{Line_Break=Regional_Indicator}", Resolution.NO_BREAK, "\\p{Line_Break=Regional_Indicator}")); + rules.add(new RegexRule("[^$RI] ($RI $RI)* $RI × $RI", "[^\\p{Line_Break=Regional_Indicator}] (\\p{Line_Break=Regional_Indicator} 
\\p{Line_Break=Regional_Indicator})* \\p{Line_Break=Regional_Indicator}", Resolution.NO_BREAK, "\\p{Line_Break=Regional_Indicator}")); + rules.add(new RegexRule("$RI ÷ $RI", "\\p{Line_Break=Regional_Indicator}", Resolution.BREAK, "\\p{Line_Break=Regional_Indicator}")); + rules.add(new RegexRule("$EB × $EM", "\\p{Line_Break=E_Base}", Resolution.NO_BREAK, "\\p{Line_Break=E_Modifier}")); + rules.add(new RegexRule("$ExtPictUnassigned × $EM", "[\\p{Extended_Pictographic}&\\p{gc=Cn}]", Resolution.NO_BREAK, "\\p{Line_Break=E_Modifier}")); + // --- End of generated code. --- + + // TODO(egg): This could just as well be part of the rules… + rules.add(new RegexRule("(ALL ÷ / ÷ ALL)", "", Resolution.BREAK, "")); + + final UnicodeSet lbSA = new UnicodeSet("\\p{lb=SA}"); + for (final NamedSet part : partition) { + if (lbSA.containsAll(part.set)) { continue; } - partition.add(new NamedSet(lbValueShortName, "\\p{lb=" + lbValueShortName + "}")); - } - for (final NamedSet refinement : interestingSets) { - for (int i = 0; i < partition.size();) { - final String name = partition.get(i).name; - final UnicodeSet set = partition.get(i).set; - final UnicodeSet intersection = new UnicodeSet(set).retainAll(refinement.set); - final UnicodeSet complement = new UnicodeSet(set).removeAll(refinement.set); - if (!intersection.isEmpty() && !complement.isEmpty()) { - partition.add(i, new NamedSet(name, complement)); - partition.add(i + 1, new NamedSet(name + "&" + refinement.name, intersection)); - partition.remove(i + 2); - i += 2; - } else { - ++i; - } - } - } - for (final NamedSet part : partition) { fSets.add(part.set); fClassNames.add(part.name); } @@ -890,877 +882,32 @@ class NamedSet { void setText(StringBuffer s) { fText = s; prepareAppliedRules(s.length()); - } - - - - - @Override - int next(int startPos) { - int pos; // Index of the char following a potential break position - int thisChar; // Character at above position "pos" - - int prevPos; // Index of the char preceding a potential 
break position - int prevChar; // Character at above position. Note that prevChar - // // and thisChar may not be adjacent because combining - // // characters between them will be ignored. - - int prevPosX2; - int prevCharX2; // Character before prevChar, more context for LB 21a - - int nextPos; // Index of the next character following pos. - // // Usually skips over combining marks. - int tPos; // temp value. - int matchVals[] = null; // Number Expression Match Results - - - if (startPos >= fText.length()) { - return -1; + StringBuilder remapped = new StringBuilder(s.toString()); + resolved = new BreakContext[s.length() + 1]; + for (int i = 0; i < resolved.length; ++i) { + resolved[i] = new BreakContext(i); } - - - // Initial values for loop. Loop will run the first time without finding breaks, - // while the invalid values shift out and the "this" and - // "prev" positions are filled in with good values. - pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. - thisChar = prevChar = prevCharX2 = 0; - nextPos = startPos; - - - // Loop runs once per position in the test text, until a break position - // is found. In each iteration, we are testing for a possible break - // just preceding the character at index "pos". The character preceding - // this char is at position "prevPos"; because of combining sequences, - // "prevPos" can be arbitrarily far before "pos". - for (;;) { - // Advance to the next position to be tested. - prevPosX2 = prevPos; - prevCharX2 = prevChar; - prevPos = pos; - prevChar = thisChar; - pos = nextPos; - nextPos = moveIndex32(fText, pos, 1); - - if (pos >= fText.length()) { - setAppliedRule(pos, "LB 2 Break at end of text"); - break; - } - - // We do this rule out-of-order because the adjustment does - // not effect the way that rules LB 3 through LB 6 match, - // and doing it here rather than after LB 6 is substantially - // simpler when combining sequences do occur. 
- - - // LB 9 Keep combining sequences together. - // advance over any CM class chars at "pos", - // result is "nextPos" for the following loop iteration. - thisChar = UTF16.charAt(fText, pos); - if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || - thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { - for (;;) { - if (nextPos == fText.length()) { - break; - } - int nextChar = UTF16.charAt(fText, nextPos); - if (!fCM.contains(nextChar)) { - break; - } - nextPos = moveIndex32(fText, nextPos, 1); - } - } - - // LB 9 Treat X CM* as if it were X - // No explicit action required. - - // LB 10 Treat any remaining combining mark as lb=AL, ea=Na - if (fCM.contains(thisChar)) { - thisChar = 'A'; - } - - - // If the loop is still warming up - if we haven't shifted the initial - // -1 positions out of prevPos yet - loop back to advance the - // position in the input without any further looking for breaks. - if (prevPos == -1) { - setAppliedRule(pos, "LB 9 adjust for combining sequences."); - continue; - } - - if (fBK.contains(prevChar)) { - setAppliedRule(pos, "LB 4 Always break after hard line breaks"); - break; - } - - if (fCR.contains(prevChar) && fLF.contains(thisChar)) { - setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF"); - continue; - } - if (fCR.contains(prevChar) || - fLF.contains(prevChar) || - fNL.contains(prevChar)) { - setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF"); - break; - } - - if (fBK.contains(thisChar) || fCR.contains(thisChar) || - fLF.contains(thisChar) || fNL.contains(thisChar) ) { - setAppliedRule(pos, "LB 6 Don't break before hard line breaks"); - continue; - } - - - if (fSP.contains(thisChar)) { - setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space"); - continue; - } - - if (fZW.contains(thisChar)) { - setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space"); - continue; - } - - // ZW SP* ÷ - // Scan backwards from 
prevChar for SP* ZW - tPos = prevPos; - while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - if (fZW.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 8 Break after zero width space"); - break; - } - - // The monkey test's way of ignoring combining characters doesn't work - // for this rule. ZWJ is also a CM. Need to get the actual character - // preceding "thisChar", not ignoring combining marks, possibly ZWJ. - { - int prevC = fText.codePointBefore(pos); - if (fZWJ.contains(prevC)) { - setAppliedRule(pos, "LB 8a ZWJ x"); - continue; - } - } - - // appliedRule: "LB 9, 10"; // Already done, at top of loop."; - - - // x WJ - // WJ x - if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { - setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters."); - continue; - } - - - if (fGL.contains(prevChar)) { - setAppliedRule(pos, "LB 12 GL x"); - continue; - } - - if (!(fSP.contains(prevChar) || - fBA.contains(prevChar) || - fHY.contains(prevChar) ) && fGL.contains(thisChar)) { - setAppliedRule(pos, "LB 12a [^SP BA HY] x GL"); - continue; - } - - if (fCL.contains(thisChar) || - fCP.contains(thisChar) || - fEX.contains(thisChar) || - fSY.contains(thisChar)) { - setAppliedRule(pos, "LB 13 Don't break before closings"); - continue; - } - - // Scan backwards, checking for this sequence. - // The OP char could include combining marks, so we actually check for - // OP CM* SP* x - tPos = prevPos; - if (fSP.contains(prevChar)) { - while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { - tPos=moveIndex32(fText, tPos, -1); - } - } - while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { - tPos=moveIndex32(fText, tPos, -1); - } - if (fOP.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 14 Don't break after OP SP*"); - continue; - } - - // Same as LB 14, scan backward for - // (sot | BK | CR | LF | NL | OP CM*| QU CM* | GL CM* | SP) [\p{Pi}&QU] CM* SP*. 
- tPos = prevPos; - // SP* (with the aforementioned Twist). - if (fSP.contains(prevChar)) { - while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - } - // CM*. - while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - // [\p{Pi}&QU]. - if (fPi.contains(UTF16.charAt(fText, tPos)) && fQU.contains(UTF16.charAt(fText, tPos))) { - if (tPos == 0) { - setAppliedRule(pos, "LB 15a sot [\\p{Pi}&QU] SP* ×"); - continue; - } else { - tPos = moveIndex32(fText, tPos, -1); - if (fBK.contains(UTF16.charAt(fText, tPos)) || fCR.contains(UTF16.charAt(fText, tPos)) || - fLF.contains(UTF16.charAt(fText, tPos)) || fNL.contains(UTF16.charAt(fText, tPos)) || - fSP.contains(UTF16.charAt(fText, tPos)) || fZW.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 15a (BK | CR | LF | NL | SP | ZW) [\\p{Pi}&QU] SP* ×"); - continue; - } - } - // CM*. - while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - if (fOP.contains(UTF16.charAt(fText, tPos)) || fQU.contains(UTF16.charAt(fText, tPos)) || - fGL.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 15a (OP | QU | GL) [\\p{Pi}&QU] SP* ×"); - continue; - } - } - - if (fPf.contains(thisChar) && fQU.contains(thisChar)) { - int nextChar = (nextPos < fText.length())? 
UTF16.charAt(fText, nextPos): 0; - if (nextPos == fText.length() || fSP.contains(nextChar) || fGL.contains(nextChar) || - fWJ.contains(nextChar) || fCL.contains(nextChar) || fQU.contains(nextChar) || - fCP.contains(nextChar) || fEX.contains(nextChar) || fIS.contains(nextChar) || - fSY.contains(nextChar) || fBK.contains(nextChar) || fCR.contains(nextChar) || - fLF.contains(nextChar) || fNL.contains(nextChar) || fZW.contains(nextChar)) { - setAppliedRule(pos, "LB 15b × [\\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | eot)"); - continue; - } - } - - if (nextPos < fText.length()) { - int nextChar = fText.codePointAt(nextPos); - if (fSP.contains(prevChar) && fIS.contains(thisChar) && fNU.contains(nextChar)) { - setAppliedRule(pos, "LB 15c Break before an IS that begins a number and follows a space"); - break; - } - } - - if (fIS.contains(thisChar)) { - setAppliedRule(pos, "LB 15d Do not break before numeric separators, even after spaces"); - continue; - } - - if (fNS.contains(thisChar)) { - tPos = prevPos; - while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS"); - continue; - } - } - - - if (fB2.contains(thisChar)) { - tPos = prevPos; - while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - if (fB2.contains(UTF16.charAt(fText, tPos))) { - setAppliedRule(pos, "LB 17 B2 SP* x B2"); - continue; - } - } - - if (fSP.contains(prevChar)) { - setAppliedRule(pos, "LB 18 break after space"); - break; - } - - // LB 19 - // × [QU-\p{Pi}] - if (fQU.contains(thisChar) && !fPi.contains(thisChar)) { 
- setAppliedRule(pos, "LB 19 × [QU-\\p{Pi}]"); - continue; - } - // [QU-\p{Pf}] × - if (fQU.contains(prevChar) && !fPf.contains(prevChar)) { - setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×"); - continue; - } - - // LB 19a - // [^\p{ea=F}\p{ea=W}\p{ea=H}] × QU - if (!feaFWH.contains(prevChar) && fQU.contains(thisChar)) { - setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × QU"); - continue; - } - // × QU ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot ) - if (fQU.contains(thisChar)) { - if (nextPos < fText.length()) { - int nextChar = fText.codePointAt(nextPos); - if (!feaFWH.contains(nextChar)) { - setAppliedRule(pos, "LB 19a × QU [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); - continue; - } - } else { - setAppliedRule(pos, "LB 19 × QU eot"); - continue; - } - } - // QU × [^\p{ea=F}\p{ea=W}\p{ea=H}] - if (fQU.contains(prevChar) && !feaFWH.contains(thisChar)) { - setAppliedRule(pos, "LB 19a QU × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]"); - continue; - } - // ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) QU × - if (fQU.contains(prevChar)) { - if (prevPos == 0) { - setAppliedRule(pos, "LB 19a sot QU ×"); - continue; - } - // prevPosX2 is -1 if there was a break, and prevCharX2 is 0; but the UAX #14 rules can - // look through breaks. 
- int breakObliviousPrevPosX2 = moveIndex32(fText, prevPos, -1); - while (fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) { - if (breakObliviousPrevPosX2 == 0) { - break; - } - int beforeCM = moveIndex32(fText, breakObliviousPrevPosX2, -1); - if (fBK.contains(fText.codePointAt(beforeCM)) || - fCR.contains(fText.codePointAt(beforeCM)) || - fLF.contains(fText.codePointAt(beforeCM)) || - fNL.contains(fText.codePointAt(beforeCM)) || - fSP.contains(fText.codePointAt(beforeCM)) || - fZW.contains(fText.codePointAt(beforeCM))) { - break; - } - breakObliviousPrevPosX2 = beforeCM; - } - if (!feaFWH.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) { - setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] QU ×"); - continue; - } - } - - if (fCB.contains(thisChar) || fCB.contains(prevChar)) { - setAppliedRule(pos, "LB 20 Break around a CB"); - break; - } - - // Don't break between Hyphens and letters if a break or a space precedes the hyphen. - // Formerly this was a Finnish tailoring. - // (sot | BK | CR | LF | NL | SP | ZW | CB | GL) ( HY | [\u2010] ) × AL - if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar))) { - // sot ( HY | [\u2010] ) × AL. - if (prevPos == 0) { - setAppliedRule(pos, "LB 20a"); - continue; - } - // prevPosX2 is -1 if there was a break; but the UAX #14 rules can - // look through breaks. 
- int breakObliviousPrevPosX2 = moveIndex32(fText, prevPos, -1); - if (fBK.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fCR.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) { - setAppliedRule(pos, "LB 20a"); - continue; - } - while (breakObliviousPrevPosX2 > 0 && - fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) { - breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1); - } - if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) { - setAppliedRule(pos, "LB 20a"); - continue; - } - } - - if (fBA.contains(thisChar) || - fHY.contains(thisChar) || - fNS.contains(thisChar) || - fBB.contains(prevChar) ) { - setAppliedRule(pos, "LB 21"); - continue; - } - - if (fHL.contains(prevCharX2) && - (fHY.contains(prevChar) || - (fBA.contains(prevChar) && !feaFWH.contains(prevChar))) && - !fHL.contains(thisChar)) { - setAppliedRule(pos, "LB 21a HL (HY | BA) x [^HL]"); - continue; - } - - if (fSY.contains(prevChar) && fHL.contains(thisChar)) { - setAppliedRule(pos, "LB 21b SY x HL"); - continue; - } - - if (fIN.contains(thisChar)) { - setAppliedRule(pos, "LB 22"); - continue; - } - - // (AL | HL) x NU - // NU x (AL | HL) - if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) { - setAppliedRule(pos, "LB 23"); - continue; - } - if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { - setAppliedRule(pos, "LB 23"); - continue; - } - - // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 
- // PR x (ID | EB | EM) - // (ID | EB | EM) x PO - if (fPR.contains(prevChar) && - (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) { - setAppliedRule(pos, "LB 23a"); - continue; - } - if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && - fPO.contains(thisChar)) { - setAppliedRule(pos, "LB 23a"); - continue; - } - - // Do not break between prefix and letters or ideographs. - // (PR | PO) x (AL | HL) - // (AL | HL) x (PR | PO) - if ((fPR.contains(prevChar) || fPO.contains(prevChar)) && - (fAL.contains(thisChar) || fHL.contains(thisChar))) { - setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs"); - continue; - } - if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && - (fPR.contains(thisChar) || fPO.contains(thisChar))) { - setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs"); - continue; - } - - boolean continueToNextPosition = false; - // LB 25. - for (XUnicodeSet[] pair : new XUnicodeSet[][]{ - new XUnicodeSet[]{fCL, fPO}, // 1. NU (SY | IS)* CL × PO - new XUnicodeSet[]{fCP, fPO}, // 2. NU (SY | IS)* CP × PO - new XUnicodeSet[]{fCL, fPR}, // 3. NU (SY | IS)* CL × PR - new XUnicodeSet[]{fCP, fPR}, // 4. NU (SY | IS)* CP × PR - }) { - XUnicodeSet left = pair[0]; - XUnicodeSet right = pair[1]; - if (left.contains(prevChar) && right.contains(thisChar)) { - // Check for the NU (SY | IS)* part. 
- boolean leftHandSideMatches = false; - tPos = moveIndex32(fText, prevPos, -1); - for (;;) { - while (tPos > 0 && fCM.contains(fText.codePointAt(tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - final int tChar = fText.codePointAt(tPos); - if (fSY.contains(tChar) || fIS.contains(tChar)) { - if (tPos == 0) { - leftHandSideMatches = false; - break; - } - tPos = moveIndex32(fText, tPos, -1); - } else if (fNU.contains(tChar)) { - leftHandSideMatches = true; - break; - } else { - leftHandSideMatches = false; - break; - } - } - if (leftHandSideMatches) { - setAppliedRule(pos, "LB 25/1..4"); - continueToNextPosition = true; - break; - } - } - } - if (continueToNextPosition) { - continue; - } - // 5. NU (SY | IS)* × PO - // 6. NU (SY | IS)* × PR - // 13. NU (SY | IS)* × NU - boolean leftHandSideMatches; - tPos = prevPos; - for (;;) { - while (tPos > 0 && fCM.contains(fText.codePointAt(tPos))) { - tPos = moveIndex32(fText, tPos, -1); - } - final int tChar = fText.codePointAt(tPos); - if (fSY.contains(tChar) || fIS.contains(tChar)) { - if (tPos == 0) { - leftHandSideMatches = false; - break; - } - tPos = moveIndex32(fText, tPos, -1); - } else if (fNU.contains(tChar)) { - leftHandSideMatches = true; - break; - } else { - leftHandSideMatches = false; - break; - } - } - if (leftHandSideMatches && - (fPO.contains(thisChar) || fPR.contains(thisChar) || fNU.contains(thisChar))) { - setAppliedRule(pos, "LB 25/5,6,13,14"); - continue; - } - if (nextPos < fText.length()) { - final int nextChar = fText.codePointAt(nextPos); - // 7. PO × OP NU - if (fPO.contains(prevChar) && fOP.contains(thisChar) && fNU.contains(nextChar)) { - setAppliedRule(pos, "LB 25/7"); - continue; - } - // 9. 
PR × OP NU - if (fPR.contains(prevChar) && fOP.contains(thisChar) && fNU.contains(nextChar)) { - setAppliedRule(pos, "LB 25/9"); - continue; - } - int nextPosX2 = moveIndex32(fText, nextPos, 1); - while (nextPosX2 < fText.length() && fCM.contains(fText.codePointAt(nextPosX2))) { - nextPosX2 = moveIndex32(fText, nextPosX2, 1); - } - - if (nextPosX2 < fText.length()) { - final int nextCharX2 = fText.codePointAt(nextPosX2); - // 7bis. PO × OP IS NU - if (fPO.contains(prevChar) && fOP.contains(thisChar) && fIS.contains(nextChar) && - fNU.contains(nextCharX2)) { - setAppliedRule(pos, "LB 25/7bis"); - continue; - } - // 9bis. PR × OP IS NU - if (fPR.contains(prevChar) && fOP.contains(thisChar) && fIS.contains(nextChar) && - fNU.contains(nextCharX2)) { - setAppliedRule(pos, "LB 25/9bis"); - continue; - } - } - } - for (XUnicodeSet[] pair : new XUnicodeSet[][]{ - new XUnicodeSet[]{fPO, fNU}, // 8. PO × NU - new XUnicodeSet[]{fPR, fNU}, // 10. PR × NU - new XUnicodeSet[]{fHY, fNU}, // 11. HY × NU - new XUnicodeSet[]{fIS, fNU}, // 12. 
IS × NU - }) { - XUnicodeSet left = pair[0]; - XUnicodeSet right = pair[1]; - if (left.contains(prevChar) && right.contains(thisChar)) { - continueToNextPosition = true; - break; - } - } - if (continueToNextPosition) { - continue; - } - - if (fJL.contains(prevChar) && (fJL.contains(thisChar) || - fJV.contains(thisChar) || - fH2.contains(thisChar) || - fH3.contains(thisChar))) { - setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); - continue; - } - - if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && - (fJV.contains(thisChar) || fJT.contains(thisChar))) { - setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); - continue; - } - - if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && - fJT.contains(thisChar)) { - setAppliedRule(pos, "LB 26 Do not break a Korean syllable."); - continue; - } - - if ((fJL.contains(prevChar) || fJV.contains(prevChar) || - fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && - fPO.contains(thisChar)) { - setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); - continue; - } - if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || - fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { - setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID."); - continue; - } - - - - if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { - setAppliedRule(pos, "LB 28 Do not break between alphabetics"); - continue; - } - - if (fAP.contains(prevChar) && - (fAK.contains(thisChar) || thisChar == '◌' || fAS.contains(thisChar))) { - setAppliedRule(pos, "LB 28a.1 AP x (AK | ◌ | AS)"); - continue; - } - - if ((fAK.contains(prevChar) || prevChar == '◌' || fAS.contains(prevChar)) && - (fVF.contains(thisChar) || fVI.contains(thisChar))) { - setAppliedRule(pos, "LB 28a.2 (AK | ◌ | AS) x (VF | VI)"); - continue; - } - - if ((fAK.contains(prevCharX2) || prevCharX2 == 
'◌' || fAS.contains(prevCharX2)) && - fVI.contains(prevChar) && - (fAK.contains(thisChar) || thisChar == '◌')) { - setAppliedRule(pos, "LB 28a.3 (AK | ◌ | AS) VI x (AK | ◌)"); - continue; - } - - if (nextPos < fText.length()) { - // note: UnicodeString::char32At(length) returns ffff, not distinguishable - // from a legit ffff noncharacter. So test length separately. - int nextChar = UTF16.charAt(fText, nextPos); - if ((fAK.contains(prevChar) || prevChar == '◌' || fAS.contains(prevChar)) && - (fAK.contains(thisChar) || thisChar == '◌' || fAS.contains(thisChar)) && - fVF.contains(nextChar)) { - setAppliedRule(pos, "LB 28a.4 (AK | ◌ | AS) x (AK | ◌ | AS) VF"); - continue; - } - } - - if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { - setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics"); - continue; - } - - // (AL | NU) x OP - // CP x (AL | NU) - if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && - fOP30.contains(thisChar)) { - setAppliedRule(pos, "LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation."); - continue; - } - if (fCP30.contains(prevChar) && - (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) { - setAppliedRule(pos, "LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation."); - continue; - } - - // RI RI ÷ RI - // RI x RI - if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) { - setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators."); - break; - } - if (fRI.contains(prevChar) && fRI.contains(thisChar)) { - // Two Regional Indicators have been paired. - // Over-write the trailing one (thisChar) to prevent it from forming another pair with a - // following RI. This is a hack. 
- thisChar = -1; - setAppliedRule(pos, "LB 30a Break between pairs of Regional Indicators."); - continue; - } - - // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier. - if (fEB.contains(prevChar) && fEM.contains(thisChar)) { - setAppliedRule(pos, "LB 30b Emoji Base x Emoji Modifier"); - continue; - } - - if (fExtPictUnassigned.contains(prevChar) && fEM.contains(thisChar)) { - setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] × EM"); - continue; + for (final SegmentationRule rule : rules) { + rule.apply(remapped, resolved); + } + for (int i = 0; i < resolved.length; ++i) { + if (resolved[i].appliedRule == null) { + throw new IllegalArgumentException("Failed to resolve at " + i); } - - // LB 31 Break everywhere else - setAppliedRule(pos, "LB 31 Break everywhere else"); - break; + setAppliedRule(i, resolved[i].appliedRule.name()); } - - return pos; } - - - // Match the following regular expression in the input text. - // ((PR | PO) CM*)? ((OP | HY) CM*)? (IS CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? (PR | PO) CM*)? - // 0 0 1 4 4 4 5 5 7 7 7 7 9 9 9 11 11 (match states) - // retVals array [0] index of the start of the match, or -1 if no match - // [1] index of first char following the match. - // Can not use Java regex because need supplementary character support, - // and because Unicode char properties version must be the same as in - // the version of ICU being tested. - private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) { - if (retVals == null) { - retVals = new int[2]; - } - retVals[0] = -1; // Indicates no match. - int matchState = 0; - int idx = startIdx; - - matchLoop: for (idx = startIdx; idx= 7) { - retVals[0] = startIdx; - retVals[1] = idx; - } - return retVals; + return -1; } - @Override List charClasses() { return fSets; @@ -2136,6 +1283,9 @@ static int nextCP(StringBuffer s, int i) { } + + + /** * random number generator. 
Not using Java's built-in Randoms for two reasons: * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. @@ -2151,6 +1301,9 @@ private static int m_rand() return (m_seed >>> 16) % 32768; } + private final static String[] monkeys = new String[] { + "🙈", "🙉", "🙊", "🐵", "🐒"}; + // Helper function for formatting error output. // Append a string into a fixed-size field in a StringBuffer. // Blank-pad the string if it is shorter than the field. @@ -2214,6 +1367,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; int i; int loopCount = 0; + int errorCount = 0; boolean printTestData = false; boolean printBreaksFromBI = false; @@ -2253,16 +1407,13 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int // For minimizing width of class name output. int classNameSize = mk.maxClassNameSize(); - - int dotsOnLine = 0; while (loopCount < numIterations || numIterations == -1) { if (numIterations == -1 && loopCount % 10 == 0) { // If test is running in an infinite loop, display a periodic tic so // we can tell that it is making progress. - System.out.print("."); - if (dotsOnLine++ >= 80){ - System.out.println(); - dotsOnLine = 0; + System.out.print(monkeys[m_rand() % monkeys.length]); + if (loopCount % 1000_000_000 == 0) { + System.out.println("\nTested " + loopCount / 1000_000_000 + " million random strings with " + errorCount + " errors"); } } // Save current random number seed, so that we can recreate the random numbers @@ -2423,6 +1574,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int } if (errorType != null) { + ++errorCount; // Format a range of the test text that includes the failure as // a data item that can be included in the rbbi test data file. 
@@ -2465,12 +1617,14 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int buffer.append("\n") .append((expectedBreaks[i] ? "Break expected but not found." : "Break found but not expected.")) .append( - String.format(" at index %d. Parameters to reproduce: @\"type=%s seed=%d loop=1\"\n", + String.format(" at index %d. Parameters to reproduce: -Dtest=RBBITestMonkey#Test%sMonkey -Dseed=%d -Dloop=1\n", i, name, seed)); int c; // Char from test data for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { - + if (ci == testText.length()) { + break; // TODO(egg): The index dance above seems wrong. + } c = testText.codePointAt(ci); buffer.append((ci == i) ? " --→" : " ") .append(String.format(" %3d : ", ci)) @@ -2514,7 +1668,7 @@ public void TestCharMonkey() { RBBICharMonkey m = new RBBICharMonkey(); BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); - RunMonkey(bi, m, "char", seed, loopCount); + RunMonkey(bi, m, "Char", seed, loopCount); } @Test @@ -2525,7 +1679,7 @@ public void TestWordMonkey() { logln("Word Break Monkey Test"); RBBIWordMonkey m = new RBBIWordMonkey(); BreakIterator bi = BreakIterator.getWordInstance(Locale.US); - RunMonkey(bi, m, "word", seed, loopCount); + RunMonkey(bi, m, "Word", seed, loopCount); } @Test @@ -2537,7 +1691,7 @@ public void TestLineMonkey() { RBBILineMonkey m = new RBBILineMonkey(); BreakIterator bi = BreakIterator.getLineInstance(Locale.US); try { - RunMonkey(bi, m, "line", seed, loopCount); + RunMonkey(bi, m, "Line", seed, loopCount); } catch(IllegalArgumentException e) { if (e.getMessage().equals("Invalid code point U+-000001")) { // Looks like you used class UnicodeSet instead of class XUnicodeSet @@ -2558,7 +1712,7 @@ public void TestSentMonkey() { logln("Sentence Break Monkey Test"); RBBISentenceMonkey m = new RBBISentenceMonkey(); BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); - RunMonkey(bi, m, "sent", seed, loopCount); + RunMonkey(bi, m, 
"Sent", seed, loopCount); } // // Round-trip monkey tests. @@ -2579,7 +1733,7 @@ public void TestRTCharMonkey() { BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); String rules = bi.toString(); BreakIterator rtbi = new RuleBasedBreakIterator(rules); - RunMonkey(rtbi, m, "char", seed, loopCount); + RunMonkey(rtbi, m, "RTChar", seed, loopCount); } @Test @@ -2592,7 +1746,7 @@ public void TestRTWordMonkey() { BreakIterator bi = BreakIterator.getWordInstance(Locale.US); String rules = bi.toString(); BreakIterator rtbi = new RuleBasedBreakIterator(rules); - RunMonkey(rtbi, m, "word", seed, loopCount); + RunMonkey(rtbi, m, "RTWord", seed, loopCount); } @Test @@ -2606,7 +1760,7 @@ public void TestRTLineMonkey() { String rules = bi.toString(); BreakIterator rtbi = new RuleBasedBreakIterator(rules); try { - RunMonkey(rtbi, m, "line", seed, loopCount); + RunMonkey(rtbi, m, "RTLine", seed, loopCount); } catch(IllegalArgumentException e) { if (e.getMessage().equals("Invalid code point U+-000001")) { // Looks like you used class UnicodeSet instead of class XUnicodeSet @@ -2629,6 +1783,6 @@ public void TestRTSentMonkey() { BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); String rules = bi.toString(); BreakIterator rtbi = new RuleBasedBreakIterator(rules); - RunMonkey(rtbi, m, "sent", seed, loopCount); + RunMonkey(rtbi, m, "RTSent", seed, loopCount); } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RegexRule.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RegexRule.java new file mode 100644 index 000000000000..cbe59071f08b --- /dev/null +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RegexRule.java @@ -0,0 +1,109 @@ +// © 2024 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +package com.ibm.icu.dev.test.rbbi; + +import java.util.Arrays; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A regex rule expressed as in UAXes #14 and #29. + * + * The rule consists of two regexes for context before and after a position in + * the remapped text, + * and of a resolution (break or not) that applies to the corresponding position + * in the original + * string if both match. + */ +class RegexRule extends SegmentationRule { + RegexRule(String name, String before, Resolution resolution, + String after) { + super(name); + resolution_ = resolution; + before_ = Pattern.compile(expandUnicodeSets(before), Pattern.COMMENTS | Pattern.DOTALL); + endsWithBefore_ = Pattern.compile( + ".*(" + expandUnicodeSets(before) + ")", Pattern.COMMENTS | Pattern.DOTALL); + after_ = Pattern.compile(expandUnicodeSets(after), Pattern.COMMENTS | Pattern.DOTALL); + } + + @Override + void apply(StringBuilder remapped, BreakContext[] resolved) { + // The unicodetools implementation simply tries, for each index, to + // match the string up to the index against /.*(before)/ (with + // `matches`) and the beginning of the string after the index against + // /after/ (with `lookingAt`), but that is very slow, especially for + // nonempty /before/. While the old monkeys are not a production + // implementation, we still do not want them to be too slow, since we + // need to test millions of sample strings. Instead we search for + // /before/ and /after/, and check resulting candidates. This speeds + // things up by a factor of ~40. + // We need to be careful about greedy matching: The first position where + // the rule matches may be before the end of the first /before/ match. + // However, it is both: + // 1. within a /before/ match or at its bounds, + // 2. at the beginning of an /after/ match. 
+ // Further, the /before/ context of the rule matches within the + // aforementioned /before/ match. Note that we need to look for + // overlapping matches, thus calls to `find` are always preceded by a + // reset via `region`. + final Matcher beforeSearch = before_.matcher(remapped); + final Matcher afterSearch = after_.matcher(remapped); + beforeSearch.useAnchoringBounds(false); + afterSearch.useAnchoringBounds(false); + if (beforeSearch.find() && afterSearch.find()) { + for (;;) { + if (afterSearch.start() < beforeSearch.start()) { + afterSearch.region(beforeSearch.start(), remapped.length()); + if (!afterSearch.find()) { + break; + } + } else if (afterSearch.start() > beforeSearch.end()) { + if (beforeSearch.start() == remapped.length()) { + break; + } + beforeSearch.region(remapped.offsetByCodePoints(beforeSearch.start(), 1), + remapped.length()); + if (!beforeSearch.find()) { + break; + } + } else { + final Optional position = Arrays.stream(resolved) + .filter(r -> r.indexInRemapped != null && r.indexInRemapped == afterSearch.start()) + .findFirst(); + if (!position.isPresent()) { + throw new IllegalArgumentException(("Rule " + name() + + " found a break at a position which does not correspond to an index in " + + "the original string")); + } + if (position.get().appliedRule == null && + endsWithBefore_.matcher(remapped) + .useAnchoringBounds(false) + .region(beforeSearch.start(), afterSearch.start()) + .matches()) { + position.get().appliedRule = this; + } + if (afterSearch.start() == remapped.length()) { + break; + } + afterSearch.region(remapped.offsetByCodePoints(afterSearch.start(), 1), + remapped.length()); + if (!afterSearch.find()) { + break; + } + } + } + } + } + + @Override + Resolution resolution() { + return resolution_; + } + + private final Pattern before_; + private final Pattern endsWithBefore_; + private final Pattern after_; + private final Resolution resolution_; +} \ No newline at end of file diff --git 
a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RemapRule.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RemapRule.java new file mode 100644 index 000000000000..e4bc8e79913b --- /dev/null +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RemapRule.java @@ -0,0 +1,167 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +package com.ibm.icu.dev.test.rbbi; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A segmentation rule expressed as in UAXes #14 and #29. + * + * A remap rule performs a normal regex replacement applied to the remapped + * string. + * This replacement may use capturing groups. Any positions in the original + * string that correspond + * to positions within the replaced text are resolved to NO_BREAK by this rule. + */ +public class RemapRule extends SegmentationRule { + RemapRule(String name, String pattern, String replacement) { + super(name); + replacement_ = replacement; + pattern_ = Pattern.compile(expandUnicodeSets(pattern), Pattern.COMMENTS | Pattern.DOTALL); + } + + @Override + void apply(StringBuilder remapped, BreakContext[] resolved) { + // This one has to be a StringBuffer rather than a StringBuilder because the + // overload of + // appendReplacement that takes a StringBuilder is new in Java 9. + StringBuffer result = new StringBuffer(); + int i = 0; + int offset = 0; + // We find all matches of the `pattern_` and replace them according to + // the `replacement_`, producing the new remapped string `result`. + // For every position i in the original string, + // `resolved[i].indexInRemapped` is null if i lies within a replaced + // match, and is set to the new index in `result` otherwise, by adding + // the accumulated difference `offset` between match lengths and + // replacement lengths. 
+ // Consider a 4-codepoint, 6 code unit string s = ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩, where + // ␠ stands for U+0020 and U+12000 𒀀 and U+1D172 ◌𝅲 each require two code + // units, and apply the following two rules: + // 1. (?<X>\P{lb=SP}) \p{lb=CM}* → ${X} + // 2. \p{lb=CM} → A + // The string remapped and the indexInRemapped values change as follows: + // indexInRemapped remapped string rule final + // (aligned on the initial string) applied offset + // 𒀀 ◌́ ␠ ◌𝅲 + // 0 1 2 3 4 5 6 ⟨ 𒀀, ◌́, ␠, ◌𝅲 ⟩ (none) + // 0 - - 2 3 4 5 ⟨ 𒀀, ␠, ◌𝅲 ⟩ 1 -1 + // 0 - - 2 3 - 4 ⟨ 𒀀, ␠, A ⟩ 2 -1 + // + // Note that the last indexInRemapped is always equal to the length of + // the remapped string. + final Matcher matcher = pattern_.matcher(remapped); + while (matcher.find()) { + for (;; ++i) { + if (resolved[i].indexInRemapped == null) { + continue; + } + if (resolved[i].indexInRemapped != null && + resolved[i].indexInRemapped > matcher.start()) { + break; + } + resolved[i].indexInRemapped += offset; + } + for (;; ++i) { + if (resolved[i].indexInRemapped == null) { + continue; + } + // Note that + // `resolved[i].indexInRemapped > matcher.end()` should + // never happen with ordinary rules, but could in principle + // happen with rules that remap to code point sequences, e.g., + // 1. BC → TYZ + // 2. AT → X + // applied to ⟨ A, B, C ⟩: + // indexInRemapped remapped rule + // A B C + // 0 1 2 3 ⟨ A, B, C ⟩ (none) + // 0 1 - 4 ⟨ A, T, Y, Z ⟩ 1 + // 0 - - 3 ⟨ X, Y, Z ⟩ 2 + // Where for the application of rule 2, the match ends at + // position 2 in remapped, which does not correspond to a + // position in the original string. + if (resolved[i].indexInRemapped != null && + resolved[i].indexInRemapped >= matcher.end()) { + break; + } + if (resolved[i].appliedRule != null && + resolved[i].appliedRule.resolution() == Resolution.BREAK) { + throw new IllegalArgumentException( + "Replacement rule at remapped indices " + + matcher.start() + + " sqq. 
spans a break"); + } + resolved[i].appliedRule = this; + resolved[i].indexInRemapped = null; + } + // While replacing, we need to check that we are not creating + // surrogate pairs. Since appendReplacement performs two + // concatenations (the unreplaced segment and the replacement), we + // need to check in two places: whether the unreplaced segment + // starts with a trailing surrogate that ends up after a leading + // surrogate, and whether the replaced segment starts with a leading + // surrogate that ends up after a trailing surrogate. + // We break the pair by replacing one of the surrogates with U+FFFF, + // which has the same properties for all but line breaking, and the + // same behaviour in line breaking (lb=SG and lb=XX are both treated + // as lb=AL). + Integer trailingLead = null; + if (result.length() > 0 && Character.isHighSurrogate(result.charAt(result.length() - 1))) { + trailingLead = result.length() - 1; + } + + matcher.appendReplacement(result, replacement_); + + if (trailingLead != null && trailingLead + 1 < result.length() && + Character.isLowSurrogate(result.charAt(trailingLead + 1))) { + result.setCharAt(trailingLead, '\uFFFF'); + } + + if (matcher.start() + offset > 0 && + Character.isHighSurrogate(result.charAt(matcher.start() + offset - 1)) && + Character.isLowSurrogate(result.charAt(matcher.start() + offset))) { + result.setCharAt(matcher.start() + offset, '\uFFFF'); + } + offset = result.length() - resolved[i].indexInRemapped; + } + for (; i < resolved.length; ++i) { + if (resolved[i].indexInRemapped == null) { + continue; + } + resolved[i].indexInRemapped += offset; + } + + Integer trailingLead = null; + if (result.length() > 0 && Character.isHighSurrogate(result.charAt(result.length() - 1))) { + trailingLead = result.length() - 1; + } + matcher.appendTail(result); + if (trailingLead != null && trailingLead + 1 < result.length() && + Character.isLowSurrogate(result.charAt(trailingLead + 1))) { + result.setCharAt(trailingLead, 
'\uFFFF'); + } + + if (resolved[resolved.length - 1].indexInRemapped != result.length()) { + StringBuilder indices = new StringBuilder(); + for (final BreakContext r : resolved) { + indices.append(r.indexInRemapped == null ? "null" : r.indexInRemapped.toString()); + indices.append(","); + } + throw new IllegalArgumentException("Inconsistent indexInRemapped " + indices + " for new remapped string " + + result); + } + remapped.setLength(0); + remapped.append(result); + } + + @Override + Resolution resolution() { + return Resolution.NO_BREAK; + } + + private final Pattern pattern_; + private final String replacement_; +} \ No newline at end of file diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/SegmentationRule.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/SegmentationRule.java new file mode 100644 index 000000000000..e7abdbe7e301 --- /dev/null +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/SegmentationRule.java @@ -0,0 +1,88 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +package com.ibm.icu.dev.test.rbbi; + +import java.text.ParsePosition; + +import javax.swing.RowFilter.Entry; + +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; +import com.ibm.icu.text.UTF16; + +/** + * A segmentation rule expressed as in UAXes #14 and #29. + * + * Rules are applied sequentially. + * Rules operate on a mutable remapped string (which the caller should initially + * set to the string + * to be segmented), and can resolve positions in the original string to either + * BREAK or NO_BREAK. 
+ */ +public abstract class SegmentationRule { + enum Resolution { + BREAK, + NO_BREAK, + } + + static class BreakContext { + BreakContext(int index) { + indexInRemapped = index; + } + + Integer indexInRemapped; + SegmentationRule appliedRule = null; + }; + + SegmentationRule(String name) { + name_ = name; + } + + // Returns "\\uhhhh" for a BMP code point and "\\uDhhh\\uDhhh" (UTF-16) for other code points. + private String javaUEscape(int codePoint) { + if (codePoint <= 0xFFFF) { + return "\\u" + Utility.hex(codePoint); + } else { + return "\\u" + Utility.hex(UTF16.getLeadSurrogate(codePoint)) + "\\u" + + Utility.hex(UTF16.getTrailSurrogate(codePoint)); + } + } + + protected String expandUnicodeSets(String regex) { + StringBuilder result = new StringBuilder(); + int i = 0; + while (i < regex.length()) { + if (regex.charAt(i) == '[' || regex.charAt(i) == '\\') { + ParsePosition pp = new ParsePosition(i); + final UnicodeSet set = new UnicodeSet(regex, pp, null); + // Escape everything. We could use _generatePattern, but then we would have to + // convert \U escapes to sequences of \‌u escapes, and to escape # ourselves. 
+ result.append('['); + for (EntryRange range : set.ranges()) { + result.append(javaUEscape(range.codepoint)); + if (range.codepointEnd != range.codepoint) { + result.append('-'); + result.append(javaUEscape(range.codepointEnd)); + } + } + result.append(']'); + i = pp.getIndex(); + } else { + result.append(regex.charAt(i++)); + } + } + return result.toString(); + } + + abstract void apply(StringBuilder remapped, BreakContext[] resolved); + + abstract Resolution resolution(); + + String name() { + return name_; + } + + private final String name_; +} diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt index 9f85b7917139..e2154abf6309 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt @@ -176,7 +176,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt index 7aad76ecf107..bb0a6880ea29 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt @@ -180,7 +180,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 72e7563c9274..f9152060bf2d 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -181,7 +181,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 99d01874d1fb..b04236532bbd 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -200,7 +200,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA BAX HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index 211298539797..c7c518d5b68b 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -182,7 +182,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index 2061f9170848..cfa9c7968e1b 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -186,7 +186,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 1c7fe9975699..781ce068be7b 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -2214,3 +2214,7 @@ Bangkok)• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. +# A hyphen following non-breaking space that carries an intervening combining +# mark is treated as word-initial; by LB20a it has no break opportunity after +# it. A bug in ICU 76 incorrectly handled that case (ICU-22986). +• ̄-k• \ No newline at end of file