From 9f72206c5cb9e01bac6717431f0edc328a7778ba Mon Sep 17 00:00:00 2001 From: sirfz Date: Fri, 30 Nov 2018 17:28:24 +0200 Subject: [PATCH] support BlanksBeforeWord API (#158) --- tesseract.pxd | 66 ++++++++++++++++++++++++++++++++--------------- tesserocr.pyx | 8 +++++- tests/test_api.py | 59 ++++++++++++++++++++++++++++-------------- tox.ini | 1 + 4 files changed, 92 insertions(+), 42 deletions(-) diff --git a/tesseract.pxd b/tesseract.pxd index 9431494..0169ff5 100644 --- a/tesseract.pxd +++ b/tesseract.pxd @@ -142,27 +142,51 @@ cdef extern from "tesseract/pageiterator.h" namespace "tesseract" nogil: void ParagraphInfo(TessParagraphJustification *, bool *, bool *, int *) const cdef extern from "tesseract/ltrresultiterator.h" namespace "tesseract" nogil: - cdef cppclass LTRResultIterator(PageIterator): - char *GetUTF8Text(PageIteratorLevel) const - void SetLineSeparator(cchar_t *) - void SetParagraphSeparator(cchar_t *) - float Confidence(PageIteratorLevel) const - cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const - cchar_t *WordRecognitionLanguage() const - StrongScriptDirection WordDirection() const - bool WordIsFromDictionary() const - bool WordIsNumeric() const - bool HasBlamerInfo() const - cchar_t *GetBlamerDebug() const - cchar_t *GetBlamerMisadaptionDebug() const - bool HasTruthString() const - bool EquivalentToTruth(cchar_t *) const - char *WordTruthUTF8Text() const - char *WordNormedUTF8Text() const - cchar_t *WordLattice(int *) const - bool SymbolIsSuperscript() const - bool SymbolIsSubscript() const - bool SymbolIsDropcap() const + IF TESSERACT_VERSION >= 0x4000000: + cdef cppclass LTRResultIterator(PageIterator): + char *GetUTF8Text(PageIteratorLevel) const + void SetLineSeparator(cchar_t *) + void SetParagraphSeparator(cchar_t *) + float Confidence(PageIteratorLevel) const + cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const + cchar_t *WordRecognitionLanguage() 
const + StrongScriptDirection WordDirection() const + bool WordIsFromDictionary() const + int BlanksBeforeWord() const + bool WordIsNumeric() const + bool HasBlamerInfo() const + cchar_t *GetBlamerDebug() const + cchar_t *GetBlamerMisadaptionDebug() const + bool HasTruthString() const + bool EquivalentToTruth(cchar_t *) const + char *WordTruthUTF8Text() const + char *WordNormedUTF8Text() const + cchar_t *WordLattice(int *) const + bool SymbolIsSuperscript() const + bool SymbolIsSubscript() const + bool SymbolIsDropcap() const + ELSE: + cdef cppclass LTRResultIterator(PageIterator): + char *GetUTF8Text(PageIteratorLevel) const + void SetLineSeparator(cchar_t *) + void SetParagraphSeparator(cchar_t *) + float Confidence(PageIteratorLevel) const + cchar_t *WordFontAttributes(bool *, bool *, bool *, bool *, bool *, bool *, int *, int *) const + cchar_t *WordRecognitionLanguage() const + StrongScriptDirection WordDirection() const + bool WordIsFromDictionary() const + bool WordIsNumeric() const + bool HasBlamerInfo() const + cchar_t *GetBlamerDebug() const + cchar_t *GetBlamerMisadaptionDebug() const + bool HasTruthString() const + bool EquivalentToTruth(cchar_t *) const + char *WordTruthUTF8Text() const + char *WordNormedUTF8Text() const + cchar_t *WordLattice(int *) const + bool SymbolIsSuperscript() const + bool SymbolIsSubscript() const + bool SymbolIsDropcap() const cdef cppclass ChoiceIterator: ChoiceIterator(const LTRResultIterator &) except + diff --git a/tesserocr.pyx b/tesserocr.pyx index 03fac00..637e63d 100644 --- a/tesserocr.pyx +++ b/tesserocr.pyx @@ -18,7 +18,7 @@ tesseract 3.04.00 ['eng', 'osd', 'equ']) """ -__version__ = '2.3.1' +__version__ = '2.4.0' import os from io import BytesIO @@ -896,6 +896,12 @@ cdef class PyLTRResultIterator(PyPageIterator): """Return True if the current word was found in a dictionary.""" return self._ltrriter.WordIsFromDictionary() + IF TESSERACT_VERSION >= 0x4000000: + def BlanksBeforeWord(self): + """Return the number of blanks before the 
current word.""" + return self._ltrriter.BlanksBeforeWord() + + def WordIsNumeric(self): """Return True if the current word is numeric.""" return self._ltrriter.WordIsNumeric() diff --git a/tests/test_api.py b/tests/test_api.py index 73832f3..55319bc 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -177,27 +177,46 @@ def test_word_confidences(self): mapped_confidences = self._api.MapWordConfidences() self.assertEqual([v[0] for v in mapped_confidences], words) self.assertEqual([v[1] for v in mapped_confidences], confidences) - + + @unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4") def test_LSTM_choices(self): - if _TESSERACT_VERSION >= 0x4000000: - """Test GetBestLSTMSymbolChoices.""" - self._api.SetVariable("lstm_choice_mode", "2") - self._api.SetImageFile(self._image_file) - self._api.Recognize() - LSTM_choices = self._api.GetBestLSTMSymbolChoices() - words = self._api.AllWords() - self.assertEqual(len(words), len(LSTM_choices)) - - for choice, word in zip(LSTM_choices, words): - chosen_word = "" - for timestep in choice: - for alternative in timestep: - self.assertGreaterEqual(alternative[1], 0.0) - self.assertLessEqual(alternative[1], 2.0) - chosen_symbol = timestep[0][0] - if chosen_symbol != " ": - chosen_word += chosen_symbol - self.assertEqual(chosen_word, word) + """Test GetBestLSTMSymbolChoices.""" + self._api.SetVariable("lstm_choice_mode", "2") + self._api.SetImageFile(self._image_file) + self._api.Recognize() + LSTM_choices = self._api.GetBestLSTMSymbolChoices() + words = self._api.AllWords() + self.assertEqual(len(words), len(LSTM_choices)) + + for choice, word in zip(LSTM_choices, words): + chosen_word = "" + for timestep in choice: + for alternative in timestep: + self.assertGreaterEqual(alternative[1], 0.0) + self.assertLessEqual(alternative[1], 2.0) + chosen_symbol = timestep[0][0] + if chosen_symbol != " ": + chosen_word += chosen_symbol + self.assertEqual(chosen_word, word) + + 
@unittest.skipIf(_TESSERACT_VERSION < 0x4000000, "tesseract < 4") + def test_result_iterator(self): + """Test result iterator.""" + self._api.SetImageFile(self._image_file) + self._api.Recognize() + it = self._api.GetIterator() + level = tesserocr.RIL.WORD + for i, w in enumerate(tesserocr.iterate_level(it, level)): + text = w.GetUTF8Text(level) + blanks = w.BlanksBeforeWord() + if i == 0: + self.assertEqual(text, "The") + self.assertEqual(blanks, 0) + elif i == 1: + self.assertEqual(text, "(quick)") + self.assertEqual(blanks, 1) + else: + break def test_detect_os(self): """Test DetectOS and DetectOrientationScript (tesseract v4+).""" diff --git a/tox.ini b/tox.ini index 90648ae..79e4dc2 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,7 @@ [tox] envlist=py27, py36, py37 [testenv] +setenv= LC_ALL = C passenv=TESSDATA_PREFIX deps= Pillow