Skip to content

Commit

Permalink
support BlanksBeforeWord API (#158)
Browse files Browse the repository at this point in the history
  • Loading branch information
sirfz committed Nov 30, 2018
1 parent 0381c05 commit 9f72206
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 42 deletions.
66 changes: 45 additions & 21 deletions tesseract.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -142,27 +142,51 @@ cdef extern from "tesseract/pageiterator.h" namespace "tesseract" nogil:
void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int *) const

cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil:
# LTRResultIterator is declared once per tesseract version branch.
# The two declarations are identical except for BlanksBeforeWord, which is
# only declared when TESSERACT_VERSION >= 0x4000000.
IF TESSERACT_VERSION >= 0x4000000:
    cdef cppclass LTRResultIterator(PageIterator):
        char *GetUTF8Text(PageIteratorLevel) const
        void SetLineSeparator(cchar_t *)
        void SetParagraphSeparator(cchar_t *)
        float Confidence(PageIteratorLevel) const
        cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const
        cchar_t *WordRecognitionLanguage() const
        StrongScriptDirection WordDirection() const
        bool WordIsFromDictionary() const
        # Version-gated member: absent from the ELSE declaration below.
        int BlanksBeforeWord() const
        bool WordIsNumeric() const
        bool HasBlamerInfo() const
        cchar_t *GetBlamerDebug() const
        cchar_t *GetBlamerMisadaptionDebug() const
        bool HasTruthString() const
        bool EquivalentToTruth(cchar_t *) const
        char *WordTruthUTF8Text() const
        char *WordNormedUTF8Text() const
        cchar_t *WordLattice(int *) const
        bool SymbolIsSuperscript() const
        bool SymbolIsSubscript() const
        bool SymbolIsDropcap() const
ELSE:
    cdef cppclass LTRResultIterator(PageIterator):
        char *GetUTF8Text(PageIteratorLevel) const
        void SetLineSeparator(cchar_t *)
        void SetParagraphSeparator(cchar_t *)
        float Confidence(PageIteratorLevel) const
        cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const
        cchar_t *WordRecognitionLanguage() const
        StrongScriptDirection WordDirection() const
        bool WordIsFromDictionary() const
        bool WordIsNumeric() const
        bool HasBlamerInfo() const
        cchar_t *GetBlamerDebug() const
        cchar_t *GetBlamerMisadaptionDebug() const
        bool HasTruthString() const
        bool EquivalentToTruth(cchar_t *) const
        char *WordTruthUTF8Text() const
        char *WordNormedUTF8Text() const
        cchar_t *WordLattice(int *) const
        bool SymbolIsSuperscript() const
        bool SymbolIsSubscript() const
        bool SymbolIsDropcap() const

cdef cppclass ChoiceIterator:
ChoiceIterator(const LTRResultIterator &) except +
Expand Down
8 changes: 7 additions & 1 deletion tesserocr.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ tesseract 3.04.00
['eng', 'osd', 'equ'])
"""

__version__ = '2.3.1'
__version__ = '2.4.0'

import os
from io import BytesIO
Expand Down Expand Up @@ -896,6 +896,12 @@ cdef class PyLTRResultIterator(PyPageIterator):
"""Return True if the current word was found in a dictionary."""
return self._ltrriter.WordIsFromDictionary()

IF TESSERACT_VERSION >= 0x4000000:
    def BlanksBeforeWord(self):
        """Return the number of blanks before the current word.

        Only defined when compiled with TESSERACT_VERSION >= 0x4000000.
        """
        return self._ltrriter.BlanksBeforeWord()


def WordIsNumeric(self):
    """Tell whether the current word is numeric (True) or not (False)."""
    is_numeric = self._ltrriter.WordIsNumeric()
    return is_numeric
Expand Down
59 changes: 39 additions & 20 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,27 +177,46 @@ def test_word_confidences(self):
mapped_confidences = self._api.MapWordConfidences()
self.assertEqual([v[0] for v in mapped_confidences], words)
self.assertEqual([v[1] for v in mapped_confidences], confidences)


@unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
def test_LSTM_choices(self):
    """Test GetBestLSTMSymbolChoices.

    Skipped by the decorator when _TESSERACT_VERSION < 0x4000000, so the
    body needs no runtime version check of its own.
    """
    self._api.SetVariable("lstm_choice_mode", "2")
    self._api.SetImageFile(self._image_file)
    self._api.Recognize()
    LSTM_choices = self._api.GetBestLSTMSymbolChoices()
    words = self._api.AllWords()
    # One entry of symbol choices is expected per recognized word.
    self.assertEqual(len(words), len(LSTM_choices))

    for choice, word in zip(LSTM_choices, words):
        chosen_word = ""
        for timestep in choice:
            # The second element of every alternative must lie in [0.0, 2.0].
            for alternative in timestep:
                self.assertGreaterEqual(alternative[1], 0.0)
                self.assertLessEqual(alternative[1], 2.0)
            # The first alternative of a timestep is the chosen symbol;
            # blanks are not part of the word text.
            chosen_symbol = timestep[0][0]
            if chosen_symbol != " ":
                chosen_word += chosen_symbol
        self.assertEqual(chosen_word, word)

@unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4")
def test_result_iterator(self):
    """Test result iterator text and BlanksBeforeWord for the first two words.

    Each of the first two words is checked for its text and for the number
    of blanks reported before it.
    """
    self._api.SetImageFile(self._image_file)
    self._api.Recognize()
    it = self._api.GetIterator()
    level = tesserocr.RIL.WORD
    expected = [("The", 0), ("(quick)", 1)]
    observed = []
    for w in tesserocr.iterate_level(it, level):
        # Stop before querying any word beyond the expected ones.
        if len(observed) == len(expected):
            break
        observed.append((w.GetUTF8Text(level), w.BlanksBeforeWord()))
    # Comparing the whole list makes the test fail, rather than pass
    # vacuously, when the iterator yields fewer than two words.
    self.assertEqual(observed, expected)

def test_detect_os(self):
"""Test DetectOS and DetectOrientationScript (tesseract v4+)."""
Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[tox]
envlist=py27, py36, py37
[testenv]
setenv= LC_ALL = C
passenv=TESSDATA_PREFIX
deps=
Pillow
Expand Down

0 comments on commit 9f72206

Please sign in to comment.