From 9f72206c5cb9e01bac6717431f0edc328a7778ba Mon Sep 17 00:00:00 2001
From: sirfz <iamfayez@gmail.com>
Date: Fri, 30 Nov 2018 17:28:24 +0200
Subject: [PATCH] support BlanksBeforeWord API (#158)

---
 tesseract.pxd     | 66 ++++++++++++++++++++++++++++++++---------------
 tesserocr.pyx     |  8 +++++-
 tests/test_api.py | 59 ++++++++++++++++++++++++++++--------------
 tox.ini           |  1 +
 4 files changed, 92 insertions(+), 42 deletions(-)

diff --git a/tesseract.pxd b/tesseract.pxd
index 9431494..0169ff5 100644
--- a/tesseract.pxd
+++ b/tesseract.pxd
@@ -142,27 +142,51 @@ cdef extern from "tesseract/pageiterator.h" namespace "tesseract" nogil:
         void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int *) const
 
 cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil:
-    cdef cppclass LTRResultIterator(PageIterator):
-        char *GetUTF8Text(PageIteratorLevel) const
-        void SetLineSeparator(cchar_t *)
-        void SetParagraphSeparator(cchar_t *)
-        float Confidence(PageIteratorLevel) const
-        cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const
-        cchar_t *WordRecognitionLanguage() const
-        StrongScriptDirection WordDirection() const
-        bool WordIsFromDictionary() const
-        bool WordIsNumeric() const
-        bool HasBlamerInfo() const
-        cchar_t *GetBlamerDebug() const
-        cchar_t *GetBlamerMisadaptionDebug() const
-        bool HasTruthString() const
-        bool EquivalentToTruth(cchar_t *) const
-        char *WordTruthUTF8Text() const
-        char *WordNormedUTF8Text() const
-        cchar_t *WordLattice(int *) const
-        bool SymbolIsSuperscript() const
-        bool SymbolIsSubscript() const
-        bool SymbolIsDropcap() const
+    IF TESSERACT_VERSION >= 0x4000000:
+        cdef cppclass LTRResultIterator(PageIterator):
+            char *GetUTF8Text(PageIteratorLevel) const
+            void SetLineSeparator(cchar_t *)
+            void SetParagraphSeparator(cchar_t *)
+            float Confidence(PageIteratorLevel) const
+            cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const
+            cchar_t *WordRecognitionLanguage() const
+            StrongScriptDirection WordDirection() const
+            bool WordIsFromDictionary() const
+            int BlanksBeforeWord() const
+            bool WordIsNumeric() const
+            bool HasBlamerInfo() const
+            cchar_t *GetBlamerDebug() const
+            cchar_t *GetBlamerMisadaptionDebug() const
+            bool HasTruthString() const
+            bool EquivalentToTruth(cchar_t *) const
+            char *WordTruthUTF8Text() const
+            char *WordNormedUTF8Text() const
+            cchar_t *WordLattice(int *) const
+            bool SymbolIsSuperscript() const
+            bool SymbolIsSubscript() const
+            bool SymbolIsDropcap() const
+    ELSE:
+        cdef cppclass LTRResultIterator(PageIterator):
+            char *GetUTF8Text(PageIteratorLevel) const
+            void SetLineSeparator(cchar_t *)
+            void SetParagraphSeparator(cchar_t *)
+            float Confidence(PageIteratorLevel) const
+            cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const
+            cchar_t *WordRecognitionLanguage() const
+            StrongScriptDirection WordDirection() const
+            bool WordIsFromDictionary() const
+            bool WordIsNumeric() const
+            bool HasBlamerInfo() const
+            cchar_t *GetBlamerDebug() const
+            cchar_t *GetBlamerMisadaptionDebug() const
+            bool HasTruthString() const
+            bool EquivalentToTruth(cchar_t *) const
+            char *WordTruthUTF8Text() const
+            char *WordNormedUTF8Text() const
+            cchar_t *WordLattice(int *) const
+            bool SymbolIsSuperscript() const
+            bool SymbolIsSubscript() const
+            bool SymbolIsDropcap() const
 
     cdef cppclass ChoiceIterator:
         ChoiceIterator(const LTRResultIterator &) except +
diff --git a/tesserocr.pyx b/tesserocr.pyx
index 03fac00..637e63d 100644
--- a/tesserocr.pyx
+++ b/tesserocr.pyx
@@ -18,7 +18,7 @@ tesseract 3.04.00
  ['eng', 'osd', 'equ'])
 """
 
-__version__ = '2.3.1'
+__version__ = '2.4.0'
 
 import os
 from io import BytesIO
@@ -896,6 +896,12 @@ cdef class PyLTRResultIterator(PyPageIterator):
         """Return True if the current word was found in a dictionary."""
         return self._ltrriter.WordIsFromDictionary()
 
+    IF TESSERACT_VERSION >= 0x4000000:
+        def BlanksBeforeWord(self):
+            """Return the number of blanks before the current word."""
+            return self._ltrriter.BlanksBeforeWord()
+
+
     def WordIsNumeric(self):
         """Return True if the current word is numeric."""
         return self._ltrriter.WordIsNumeric()
diff --git a/tests/test_api.py b/tests/test_api.py
index 73832f3..55319bc 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -177,27 +177,46 @@ def test_word_confidences(self):
         mapped_confidences = self._api.MapWordConfidences()
         self.assertEqual([v[0] for v in mapped_confidences], words)
         self.assertEqual([v[1] for v in mapped_confidences], confidences)
-        
+
+    @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
     def test_LSTM_choices(self):
-        if _TESSERACT_VERSION >= 0x4000000:
-            """Test GetBestLSTMSymbolChoices."""
-            self._api.SetVariable("lstm_choice_mode", "2")
-            self._api.SetImageFile(self._image_file)
-            self._api.Recognize()
-            LSTM_choices = self._api.GetBestLSTMSymbolChoices()
-            words = self._api.AllWords()
-            self.assertEqual(len(words), len(LSTM_choices))
-            
-            for choice, word in zip(LSTM_choices, words):
-                chosen_word = ""
-                for timestep in choice:
-                    for alternative in timestep:
-                        self.assertGreaterEqual(alternative[1], 0.0)
-                        self.assertLessEqual(alternative[1], 2.0)
-                    chosen_symbol = timestep[0][0]
-                    if chosen_symbol != " ":
-                        chosen_word += chosen_symbol
-                self.assertEqual(chosen_word, word)
+        """Test GetBestLSTMSymbolChoices."""
+        self._api.SetVariable("lstm_choice_mode", "2")
+        self._api.SetImageFile(self._image_file)
+        self._api.Recognize()
+        LSTM_choices = self._api.GetBestLSTMSymbolChoices()
+        words = self._api.AllWords()
+        self.assertEqual(len(words), len(LSTM_choices))
+
+        for choice, word in zip(LSTM_choices, words):
+            chosen_word = ""
+            for timestep in choice:
+                for alternative in timestep:
+                    self.assertGreaterEqual(alternative[1], 0.0)
+                    self.assertLessEqual(alternative[1], 2.0)
+                chosen_symbol = timestep[0][0]
+                if chosen_symbol != " ":
+                    chosen_word += chosen_symbol
+            self.assertEqual(chosen_word, word)
+
+    @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
+    def test_result_iterator(self):
+        """Test result iterator."""
+        self._api.SetImageFile(self._image_file)
+        self._api.Recognize()
+        it = self._api.GetIterator()
+        level = tesserocr.RIL.WORD
+        for i, w in enumerate(tesserocr.iterate_level(it, level)):
+            text = w.GetUTF8Text(level)
+            blanks = w.BlanksBeforeWord()
+            if i == 0:
+                self.assertEqual(text, "The")
+                self.assertEqual(blanks, 0)
+            elif i == 1:
+                self.assertEqual(text, "(quick)")
+                self.assertEqual(blanks, 1)
+            else:
+                break
 
     def test_detect_os(self):
         """Test DetectOS and DetectOrientationScript (tesseract v4+)."""
diff --git a/tox.ini b/tox.ini
index 90648ae..79e4dc2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,7 @@
 [tox]
 envlist=py27, py36, py37
 [testenv]
+setenv= LC_ALL = C
 passenv=TESSDATA_PREFIX
 deps=
     Pillow