From 84d8f6c503e3932c35e0d71850a5e35bfb7588e6 Mon Sep 17 00:00:00 2001
From: Bruce Robertson <brobertson@mta.ca>
Date: Thu, 25 Oct 2018 09:12:29 -0300
Subject: [PATCH 1/6] add option to output <span class=ocr_word> elements to
 hocr

---
 ocropus-hocr | 76 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 3 deletions(-)
diff --git a/ocropus-hocr b/ocropus-hocr
index 43b7222f..23fc389a 100755
--- a/ocropus-hocr
+++ b/ocropus-hocr
@@ -32,6 +32,7 @@ parser.add_argument("-b","--nobreaks",action="store_true",help="don't output lin
 parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
 parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
 parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
+parser.add_argument("-w","--ocrwords",action="store_true",help="output ocr_word spans in hocr")
 parser.add_argument('files',nargs='+')
 args = parser.parse_args()
 args.files = ocrolib.glob_all(args.files)
@@ -125,8 +126,26 @@ for arg in args.files:
             text = re.sub(r'\&','\&amp;',text)
             text = re.sub(r'\<','\&lt;',text)
 
-            # accumulate information for each line here
-
+            # accumulate character position information for each line if the 
+            # user wants <span char="ocr_word"> and if the llocs files are available
+            # (these are output by setting the --llocs switch on ocropus-gpageseg) 
+            if (args.ocrwords and os.path.exists(lbase+".llocs")):
+                char_pos_data =  codecs.open(lbase+".llocs",'r','utf-8').read()
+                lines = char_pos_data.split("\n")
+                #the last line is blank, providing no info
+                lines = lines[:-1]
+                char_coords = []
+                for line in lines:
+                    elements = line.split("\t")
+                    to_coords = [elements[0],int(float(elements[1]))]
+                    if not (elements[0] == ''):
+                        char_coords.append(to_coords)
+                # remove final and initial spaces in lines, since they do not signify and they
+                # mess up word bboxes
+                if (char_coords[-1][0] == u" "):
+                    char_coords = char_coords[:-1]
+                if (char_coords[0][0] == u" "):
+                    char_coords = char_coords[1:]
             style = ""
             info = ""
 
@@ -149,7 +168,58 @@ for arg in args.files:
 
             PN("<span")
             if style!="": PN(" style='"+style+"'")
-            PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
+            # use the data from the llocs files to provide <span class="ocr_word"> elements and their
+            # content if the user so desires and if the collected data is not empty. For instance,
+            # if the line only contained a space character, it would be stripped and the list would
+            # be empty
+            if args.ocrwords and (len(char_coords) > 0):
+		    try:
+			PN(" class='ocr_line' title='%s'>"%info)
+			# loop through all the characters in the line, breaking to make a new ocr_word
+			# when we come to a space
+                        # use the line coordinates to seed the word coordinates. In particular the y values
+                        # are always used for word bounding boxes
+			word_x0 = x0
+			word_y0 = y0
+			word_x1 = x0
+			word_y1 = y1
+			current_word = u""
+                        # the last element is a special case, so we run a counter to be able to detect it
+			char_coords_count = len(char_coords)
+			count = 0 
+                        # keep track of the last char's x in order to put the word x boundary in the middle of the 
+                        # space between words. This ensures that no part of the word gets omitted from the bbox
+                        previous_char_x = 0
+			for char_coord in char_coords:
+			    current_char, char_x = char_coord
+			    if (current_char == u" "):
+                                midpoint = (char_x + previous_char_x) / 2
+				word_x1 = midpoint + x0
+				word_info="bbox %d %d %d %d"%(word_x0,word_y0,word_x1,word_y1)
+				PN("<span class='ocr_word' title='%s'>"%word_info,current_word,"</span> ")
+				# set the beginning x of the next word to the ending x of this one
+                                word_x0 = word_x1
+                                # reset the accumulated characters in the word
+				current_word = u""
+                            elif (count == (char_coords_count-1)):
+                                # in the case of the last character in the line:
+                                # 1) the *line's* greatest x value is used as this element's, too
+                                word_info="bbox %d %d %d %d"%(word_x0,word_y0,x1,word_y1)
+                                # 2) no space is put after the word span
+                                PN("<span class='ocr_word' title='%s'>"%word_info,current_word,"</span>")
+			    else:
+                                # if the current character is not a space, then append it to current word
+                                # which will be outputted in the <span class="ocr_word" when a space *is*
+                                # encountered
+				current_word = current_word + current_char
+			    count = count + 1
+                            previous_char_x = char_x
+			PN("</span>")
+		    except:
+			E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-gpageseg?")
+			PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
+            else:
+                 PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
             if not args.nobreaks: P("<br />")
             else: P()
 

From 9a0d07bb94b8169b4d02eed944d0078752213fa9 Mon Sep 17 00:00:00 2001
From: Bruce Robertson <brobertson@mta.ca>
Date: Thu, 25 Oct 2018 10:01:52 -0300
Subject: [PATCH 2/6] correct the origin of llocs file

---
 ocropus-hocr | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ocropus-hocr b/ocropus-hocr
index 23fc389a..3e09aab4 100755
--- a/ocropus-hocr
+++ b/ocropus-hocr
@@ -128,7 +128,7 @@ for arg in args.files:
 
             # accumulate character position information for each line if the 
             # user wants <span char="ocr_word"> and if the llocs files are available
-            # (these are output by setting the --llocs switch on ocropus-gpageseg) 
+            # (these are output by setting the --llocs switch on ocropus-rpred) 
             if (args.ocrwords and os.path.exists(lbase+".llocs")):
                 char_pos_data =  codecs.open(lbase+".llocs",'r','utf-8').read()
                 lines = char_pos_data.split("\n")
@@ -216,7 +216,7 @@ for arg in args.files:
                             previous_char_x = char_x
 			PN("</span>")
 		    except:
-			E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-gpageseg?")
+			E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-rpred?")
 			PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
             else:
                  PN(" class='ocr_line' title='%s'>"%info,text,"</span>")

From 885fb17ee71aaf622ecfe5591ba9e08a99a879cd Mon Sep 17 00:00:00 2001
From: Bruce Robertson <brobertson@mta.ca>
Date: Fri, 16 Nov 2018 11:15:25 -0400
Subject: [PATCH 3/6] deal with corner case which sometimes lops off final
 character in word

---
 ocropus-hocr | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/ocropus-hocr b/ocropus-hocr
index 3e09aab4..95537619 100755
--- a/ocropus-hocr
+++ b/ocropus-hocr
@@ -132,20 +132,22 @@ for arg in args.files:
             if (args.ocrwords and os.path.exists(lbase+".llocs")):
                 char_pos_data =  codecs.open(lbase+".llocs",'r','utf-8').read()
                 lines = char_pos_data.split("\n")
-                #the last line is blank, providing no info
-                lines = lines[:-1]
                 char_coords = []
                 for line in lines:
                     elements = line.split("\t")
-                    to_coords = [elements[0],int(float(elements[1]))]
-                    if not (elements[0] == ''):
-                        char_coords.append(to_coords)
+                    #sometimes a line is blank, providing no info. We skip these
+                    if (len(elements) == 2):
+                        to_coords = [elements[0],int(float(elements[1]))]
+                        if not (elements[0] == ''):
+                            char_coords.append(to_coords)
                 # remove final and initial spaces in lines, since they do not signify and they
                 # mess up word bboxes
                 if (char_coords[-1][0] == u" "):
                     char_coords = char_coords[:-1]
                 if (char_coords[0][0] == u" "):
                     char_coords = char_coords[1:]
+                for char_coord in char_coords:
+                    print char_coord
             style = ""
             info = ""
 
@@ -206,7 +208,7 @@ for arg in args.files:
                                 # 1) the *line's* greatest x value is used as this element's, too
                                 word_info="bbox %d %d %d %d"%(word_x0,word_y0,x1,word_y1)
                                 # 2) no space is put after the word span
-                                PN("<span class='ocr_word' title='%s'>"%word_info,current_word,"</span>")
+                                PN("<span class='ocr_word' title='%s'>"%word_info,current_word+current_char,"</span>")
 			    else:
                                 # if the current character is not a space, then append it to current word
                                 # which will be outputted in the <span class="ocr_word" when a space *is*

From 403c07cbbe4e0e314f64af123ae7fb8c181af516 Mon Sep 17 00:00:00 2001
From: Bruce Robertson <brobertson@mta.ca>
Date: Tue, 23 Jul 2019 11:04:43 -0300
Subject: [PATCH 4/6] use the edge of the space as the beginning of word.
 (Classifiers vary as to whether they use the leading or trailing edge, so
 this is the best we can do.)

---
 ocropus-hocr   | 28 +++++++++++++++++++++++-----
 ocropus-rtrain |  6 +++---
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/ocropus-hocr b/ocropus-hocr
index 95537619..ee81862c 100755
--- a/ocropus-hocr
+++ b/ocropus-hocr
@@ -8,10 +8,10 @@ import re
 import glob
 import argparse
 import codecs
-
+import unicodedata
 import numpy as np
 from matplotlib.pyplot import imread
-
+#from unicodedata import name
 import ocrolib
 from ocrolib import hocr
 
@@ -146,8 +146,24 @@ for arg in args.files:
                     char_coords = char_coords[:-1]
                 if (char_coords[0][0] == u" "):
                     char_coords = char_coords[1:]
+                    #it seems rpred messes up when this happens, and gives the coord of the first real
+                    #(non-space) character. So we need to bump them up
+                    #print "we be spacing!"
+                    #print "in"
+                    #print char_coords
+                    #b=[row[0] for row in char_coords]
+                    #c=[row[1] for row in char_coords]
+                    #b=b[1:]
+                    #c=c[:-1]
+                    #out = []
+                    #for x, y in zip(b, c):
+                       #out = out + [[x,y]]
+                    #char_coords=out
+                    #print "out"
+                    #print char_coords
                 for char_coord in char_coords:
-                    print char_coord
+                    print    char_coord[0] + " " + unicodedata.name(char_coord[0][0]) + " " + str(char_coord[1])
+                    #print "'", name(char_cood[0])
             style = ""
             info = ""
 
@@ -192,11 +208,12 @@ for arg in args.files:
                         # keep track of the last char's x in order to put the word x boundary in the middle of the 
                         # space between words. This ensures that no part of the word gets omitted from the bbox
                         previous_char_x = 0
+                        previous_char = u""
 			for char_coord in char_coords:
 			    current_char, char_x = char_coord
-			    if (current_char == u" "):
+                            if (current_char == u" "):
                                 midpoint = (char_x + previous_char_x) / 2
-				word_x1 = midpoint + x0
+				word_x1 = char_x + x0
 				word_info="bbox %d %d %d %d"%(word_x0,word_y0,word_x1,word_y1)
 				PN("<span class='ocr_word' title='%s'>"%word_info,current_word,"</span> ")
 				# set the beginning x of the next word to the ending x of this one
@@ -216,6 +233,7 @@ for arg in args.files:
 				current_word = current_word + current_char
 			    count = count + 1
                             previous_char_x = char_x
+                            previous_char = current_char
 			PN("</span>")
 		    except:
 			E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-rpred?")
diff --git a/ocropus-rtrain b/ocropus-rtrain
index 1e92e3cb..5c871072 100755
--- a/ocropus-rtrain
+++ b/ocropus-rtrain
@@ -315,9 +315,9 @@ for trial in range(start,args.ntrain):
     gta = "".join(codec.decode(acs))
     if not args.quiet:
         print("%d %.2f %s" % (trial, network.error, line.shape), fname)
-        print("   TRU:", repr(transcript))
-        print("   ALN:", repr(gta[:len(transcript)+5]))
-        print("   OUT:", repr(pred[:len(transcript)+5]))
+        print("   TRU:", transcript)
+        print("   ALN:", gta[:len(transcript)+5])
+        print("   OUT:", pred[:len(transcript)+5])
 
     pred = re.sub(' ','_',pred)
     gta = re.sub(' ','_',gta)

From 1f5e8af7b389cd01de89c0f8dea7bd8733cfa8f3 Mon Sep 17 00:00:00 2001
From: Bruce Robertson <brobertson@mta.ca>
Date: Tue, 23 Jul 2019 11:05:15 -0300
Subject: [PATCH 5/6] disable curly-quote and ligature char substitutions

---
 ocrolib/chars.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ocrolib/chars.py b/ocrolib/chars.py
index 8ee82c5c..112c006f 100644
--- a/ocrolib/chars.py
+++ b/ocrolib/chars.py
@@ -38,12 +38,12 @@
     (u'[_~#]',u"~"), # OCR control characters
     (u'"',u"''"), # typewriter double quote
     (u"`",u"'"), # grave accent
-    (u'[“”]',u"''"), # fancy quotes
+#    (u'[“”]',u"''"), # fancy quotes
     (u"´",u"'"), # acute accent
-    (u"[‘’]",u"'"), # left single quotation mark
-    (u"[“”]",u"''"), # right double quotation mark
-    (u"“",u"''"), # German quotes
-    (u"„",u",,"), # German quotes
+#    (u"[‘’]",u"'"), # left single quotation mark
+#    (u"[“”]",u"''"), # right double quotation mark
+#    (u"“",u"''"), # German quotes
+#    (u"„",u",,"), # German quotes
     (u"…",u"..."), # ellipsis
     (u"′",u"'"), # prime
     (u"″",u"''"), # double prime
@@ -51,11 +51,11 @@
     (u"〃",u"''"), # ditto mark
     (u"µ",u"μ"), # replace micro unit with greek character
     (u"[–—]",u"-"), # variant length hyphens
-    (u"ﬂ",u"fl"), # expand Unicode ligatures
-    (u"ﬁ",u"fi"),
-    (u"ﬀ",u"ff"),
-    (u"ﬃ",u"ffi"),
-    (u"ﬄ",u"ffl"),
+#    (u"ﬂ",u"fl"), # expand Unicode ligatures
+#    (u"ﬁ",u"fi"),
+#    (u"ﬀ",u"ff"),
+#    (u"ﬃ",u"ffi"),
+#    (u"ﬄ",u"ffl"),
 ]
 
 def requote(s):

From 2619e62bb35aa4eb620f87e79091687d31591ffc Mon Sep 17 00:00:00 2001
From: Bruce Robertson <brobertson@mta.ca>
Date: Tue, 23 Jul 2019 11:05:38 -0300
Subject: [PATCH 6/6] use decomposed unicode always

---
 ocrolib/common.py | 2 +-
 ocrolib/lstm.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ocrolib/common.py b/ocrolib/common.py
index 4b5ee87c..7c4f5c8e 100644
--- a/ocrolib/common.py
+++ b/ocrolib/common.py
@@ -48,7 +48,7 @@ def normalize_text(s):
     This eliminates common ambiguities and weird unicode
     characters."""
     s = unicode(s)
-    s = unicodedata.normalize('NFC',s)
+    s = unicodedata.normalize('NFD',s)
     s = re.sub(ur'\s+(?u)',' ',s)
     s = re.sub(ur'\n(?u)','',s)
     s = re.sub(ur'^\s+(?u)','',s)
diff --git a/ocrolib/lstm.py b/ocrolib/lstm.py
index f5307590..709350d6 100644
--- a/ocrolib/lstm.py
+++ b/ocrolib/lstm.py
@@ -838,7 +838,7 @@ def ctc_align_targets(outputs,targets,threshold=100.0,verbose=0,debug=0,lo=1e-5)
     return aligned
 
 def normalize_nfkc(s):
-    return unicodedata.normalize('NFKC',s)
+    return unicodedata.normalize('NFD',s)
 
 def add_training_info(network):
     return network