Merge pull request #941 from spencermountain/dev

Dev
spencermountain · Jul 29, 2022 · 98282ff
2 parents 6a58ff8 + 88fb943
commit 98282ff
Show file tree

Hide file tree

Showing 14 changed files with 47 additions and 17 deletions.
diff --git a/builds/compromise.js b/builds/compromise.js
diff --git a/builds/one/compromise-one.cjs b/builds/one/compromise-one.cjs
diff --git a/builds/one/compromise-one.mjs b/builds/one/compromise-one.mjs
diff --git a/builds/three/compromise-three.cjs b/builds/three/compromise-three.cjs
diff --git a/builds/three/compromise-three.mjs b/builds/three/compromise-three.mjs
diff --git a/builds/two/compromise-two.cjs b/builds/two/compromise-two.cjs
diff --git a/builds/two/compromise-two.mjs b/builds/two/compromise-two.mjs
diff --git a/changelog.md b/changelog.md
@@ -8,6 +8,9 @@ While all _Major_ releases should be reviewed, our only _large_ releases are **v
 
 <!-- #### 14.5.0 [Unreleased]
 -->
+#### 14.4.2 [July 2022]
+- **[fix]** - hotfix for sentence tokenization issue #935
+
 #### 14.4.1 [July 2022]
 - **[change]** - improvements to negative-optional match logic - `!foo?`
 - **[change]** - support short sentences embedded in quotes+parentheses

diff --git a/package.json b/package.json
@@ -2,7 +2,7 @@
   "author": "Spencer Kelly <[email protected]> (http://spencermounta.in)",
   "name": "compromise",
   "description": "modest natural language processing",
-  "version": "14.4.1",
+  "version": "14.4.2",
   "main": "./src/three.js",
   "unpkg": "./builds/compromise.js",
   "type": "module",
@@ -114,4 +114,4 @@
     "_tests/**"
   ],
   "license": "MIT"
-}
+}
diff --git a/scratch.js b/scratch.js
@@ -9,9 +9,9 @@ let txt = ''
 let doc
 let m
 
+// doc = nlp('Those are Great Danes')
+// doc.nouns(0).toSingular()
+// console.log(doc.text())
 
-let arr = []
-
-doc = nlp("she is cool").sentences()
-doc.toFutureTense()
-console.log(doc.text())
+doc = nlp(`The hero was stunned by the scary monster. The glowing girl said (Hey! Leave him alone!).`)
+doc.debug()
diff --git a/src/1-one/tokenize/methods/01-sentences/04-quote-merge.js b/src/1-one/tokenize/methods/01-sentences/04-quote-merge.js
@@ -50,6 +50,7 @@ const quoteMerge = function (splits) {
       // look at the next sentence for a closing quote,
       if (closesQuote(splits[i + 1]) && splits[i + 1].length < MAX_QUOTE) {
         splits[i] += splits[i + 1]// merge them
+        arr.push(splits[i])
         splits[i + 1] = ''
         i += 1
         continue
@@ -60,6 +61,7 @@ const quoteMerge = function (splits) {
         //make sure it's not too-long
         if (toAdd.length < MAX_QUOTE) {
           splits[i] += toAdd
+          arr.push(splits[i])
           splits[i + 1] = ''
           splits[i + 2] = ''
           i += 2

diff --git a/src/1-one/tokenize/methods/01-sentences/05-parens-merge.js b/src/1-one/tokenize/methods/01-sentences/05-parens-merge.js
@@ -16,6 +16,7 @@ const mergeParens = function (splits) {
         if (m2 !== null && m.length === 1 && !hasOpen.test(splits[i + 1])) {
           // merge in 2nd sentence
           splits[i] += splits[i + 1]
+          arr.push(splits[i])
           splits[i + 1] = ''
           i += 1
           continue

diff --git a/src/_version.js b/src/_version.js
@@ -1 +1 @@
-export default '14.4.1'
+export default '14.4.2'
diff --git a/tests/two/misc/tokenize.test.js → tests/one/tokenize/sentence-split.test.js b/tests/two/misc/tokenize.test.js → tests/one/tokenize/sentence-split.test.js
@@ -1,6 +1,6 @@
 import test from 'tape'
-import nlp from '../_lib.js'
-const here = '[two/tokenize] '
+import nlp from '../../two/_lib.js'
+const here = '[one/sentence-split] '
 
 
 test('sentence tokenizer', function (t) {
@@ -24,6 +24,9 @@ test('sentence tokenizer', function (t) {
     // [`it fell out of the bag. (I wasn't fast enough.) Now it's on the floor.`, 3],
     [`the scent of basil (my favorite).`, 1],
     [`Your whole life (right? right?) might go smoothly this year.`, 1],
+    [`before. (inside word) and (inside). after`, 3],
+    [`before. (inside word?) and (inside!). after`, 3],
+    [`before. (the whole thing is inside). after`, 3],
     // quotation wrapper
     [`the doc said "no sir" and walked away. the end`, 2],
     [`Kendal asked, “What time is it?”`, 1],
@@ -37,6 +40,9 @@ test('sentence tokenizer', function (t) {
     // mis-matched examples
     ['i thought "no way! and he said "yes way".', 2],//
     ['i thought (no way! and he said (yes)', 2],//
+    ['i thought (no way! and he said yes', 2],
+    ['(no way! and he said yes', 2],
+    ['"no way! and he\'s cool', 2],
   ]
   arr.forEach(a => {
     let [str, len] = a
@@ -83,3 +89,21 @@ test('emoji-only sentence', function (t) {
   t.equal(doc.length, 2, here + 'boemojith sentence')
   t.end()
 })
+
+test('nested quotes', function (t) {
+  let doc = nlp(`The hero was stunned by the scary monster. The glowing girl said "Hey! Leave him alone!".`)
+  t.equal(doc.length, 2, here + 'nested quote sentence')
+
+  doc = nlp(`foo bar. Before "quote here" and "quote here".`)
+  t.equal(doc.length, 2, here + '2 quote sentence')
+
+  doc = nlp(`foo bar. Before "quote here?" and "quote here?".`)
+  t.equal(doc.length, 2, here + '2 quotes with sentence')
+
+  doc = nlp(`Foo bar. Before "quote here? and quote here?". After`)
+  t.equal(doc.length, 3, here + '1 quotes with 2 sentences')
+
+  doc = nlp(`Foo bar. Before "quote here? and quote here? also here!". After`)
+  t.equal(doc.length, 3, here + '1 quotes with 3 sentences')
+  t.end()
+})