ocropus-archive · brobertson · Oct 25, 2018 · Oct 25, 2018 · Nov 16, 2018 · Jul 23, 2019
diff --git a/ocrolib/chars.py b/ocrolib/chars.py
@@ -38,24 +38,24 @@
     (u'[_~#]',u"~"), # OCR control characters
     (u'"',u"''"), # typewriter double quote
     (u"`",u"'"), # grave accent
-    (u'[“”]',u"''"), # fancy quotes
+#    (u'[“”]',u"''"), # fancy quotes
     (u"´",u"'"), # acute accent
-    (u"[‘’]",u"'"), # left single quotation mark
-    (u"[“”]",u"''"), # right double quotation mark
-    (u"“",u"''"), # German quotes
-    (u"„",u",,"), # German quotes
+#    (u"[‘’]",u"'"), # left single quotation mark
+#    (u"[“”]",u"''"), # right double quotation mark
+#    (u"“",u"''"), # German quotes
+#    (u"„",u",,"), # German quotes
     (u"…",u"..."), # ellipsis
     (u"′",u"'"), # prime
     (u"″",u"''"), # double prime
     (u"‴",u"'''"), # triple prime
     (u"〃",u"''"), # ditto mark
     (u"µ",u"μ"), # replace micro unit with greek character
     (u"[–—]",u"-"), # variant length hyphens
-    (u"ﬂ",u"fl"), # expand Unicode ligatures
-    (u"ﬁ",u"fi"),
-    (u"ﬀ",u"ff"),
-    (u"ﬃ",u"ffi"),
-    (u"ﬄ",u"ffl"),
+#    (u"ﬂ",u"fl"), # expand Unicode ligatures
+#    (u"ﬁ",u"fi"),
+#    (u"ﬀ",u"ff"),
+#    (u"ﬃ",u"ffi"),
+#    (u"ﬄ",u"ffl"),
 ]
 
 def requote(s):

diff --git a/ocrolib/common.py b/ocrolib/common.py
@@ -48,7 +48,7 @@ def normalize_text(s):
     This eliminates common ambiguities and weird unicode
     characters."""
     s = unicode(s)
-    s = unicodedata.normalize('NFC',s)
+    s = unicodedata.normalize('NFD',s)
     s = re.sub(ur'\s+(?u)',' ',s)
     s = re.sub(ur'\n(?u)','',s)
     s = re.sub(ur'^\s+(?u)','',s)

diff --git a/ocrolib/lstm.py b/ocrolib/lstm.py
@@ -838,7 +838,7 @@ def ctc_align_targets(outputs,targets,threshold=100.0,verbose=0,debug=0,lo=1e-5)
     return aligned
 
 def normalize_nfkc(s):
-    return unicodedata.normalize('NFKC',s)
+    return unicodedata.normalize('NFD',s)  # NOTE(review): the name still says NFKC, but NFD is what is applied now
 
 def add_training_info(network):
     return network

diff --git a/ocropus-hocr b/ocropus-hocr
@@ -8,10 +8,10 @@ import re
 import glob
 import argparse
 import codecs
-
+import unicodedata
 import numpy as np
 from matplotlib.pyplot import imread
-
+#from unicodedata import name
 import ocrolib
 from ocrolib import hocr
 
@@ -32,6 +32,7 @@ parser.add_argument("-b","--nobreaks",action="store_true",help="don't output lin
 parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
 parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
 parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
+parser.add_argument("-w","--ocrwords",action="store_true",help="output ocr_word spans in hocr")
 parser.add_argument('files',nargs='+')
 args = parser.parse_args()
 args.files = ocrolib.glob_all(args.files)
@@ -125,8 +126,44 @@ for arg in args.files:
             text = re.sub(r'\&','\&amp;',text)
             text = re.sub(r'\<','\&lt;',text)
 
-            # accumulate information for each line here
-
+            # accumulate character position information for each line if the 
+            # user wants <span class="ocr_word"> and if the llocs files are available
+            # (these are output by setting the --llocs switch on ocropus-rpred) 
+            # start every text line with an empty list, so that the ocr_word output further down
+            # never meets an undefined name or a previous line's data when there is no .llocs file
+            char_coords = []
+            if (args.ocrwords and os.path.exists(lbase+".llocs")):
+                # the with-block closes the .llocs file as soon as it has been read
+                with codecs.open(lbase+".llocs",'r','utf-8') as llocs_stream:
+                    char_pos_data = llocs_stream.read()
+                for llocs_row in char_pos_data.split("\n"):
+                    elements = llocs_row.split("\t")
+                    #sometimes a line is blank, providing no info. We skip these
+                    if (len(elements) == 2):
+                        to_coords = [elements[0],int(float(elements[1]))]
+                        if not (elements[0] == ''):
+                            char_coords.append(to_coords)
+                # remove final and initial spaces in lines, since they do not signify and they
+                # mess up word bboxes (the list may be empty here, hence the extra checks)
+                if char_coords and (char_coords[-1][0] == u" "):
+                    char_coords = char_coords[:-1]
+                if char_coords and (char_coords[0][0] == u" "):
+                    char_coords = char_coords[1:]
+                    #it seems rpred messes up when this happens, and gives the coord of the first real
+                    #(non-space) character. So we need to bump them up
+                    # NOTE: the shift sketched below is disabled and only kept for reference
+                    #print "in"
+                    #print char_coords
+                    #b=[row[0] for row in char_coords]
+                    #c=[row[1] for row in char_coords]
+                    #b=b[1:]
+                    #c=c[:-1]
+                    #out = []
+                    #for x, y in zip(b, c):
+                       #out = out + [[x,y]]
+                    #char_coords=out
+                    #print "out"
+                    #print char_coords
             style = ""
             info = ""
 
@@ -149,7 +186,60 @@ for arg in args.files:
 
             PN("<span")
             if style!="": PN(" style='"+style+"'")
-            PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
+            # use the data from the llocs files to provide <span class="ocr_word"> elements and their
+            # content if the user so desires and if the collected data is not empty. For instance,
+            # if the line only contained a space character, it would be stripped and the list would
+            # be empty
+            word_spans = None
+            if args.ocrwords and (len(char_coords) > 0):
+                try:
+                    # build every ocr_word span before writing anything: if this fails (for
+                    # instance because the x0,y0,x1,y1 values used here are unavailable) the
+                    # fallback below can still emit one well-formed ocr_line element
+                    word_spans = []
+                    # use the line coordinates to seed the word coordinates. In particular the y values
+                    # are always used for word bounding boxes
+                    word_x0 = x0
+                    word_y0 = y0
+                    word_y1 = y1
+                    current_word = u""
+                    # the last element is a special case, so we note its index to be able to detect it
+                    last_index = len(char_coords) - 1
+                    # loop through all the characters in the line, breaking to make a new ocr_word
+                    # when we come to a space
+                    for index, (current_char, char_x) in enumerate(char_coords):
+                        if (current_char == u" "):
+                            # the x recorded for the space (offset by the line's x0) closes this word
+                            word_x1 = char_x + x0
+                            word_info = "bbox %d %d %d %d"%(word_x0,word_y0,word_x1,word_y1)
+                            word_spans.append(("<span class='ocr_word' title='%s'>"%word_info,current_word,"</span> "))
+                            # set the beginning x of the next word to the ending x of this one
+                            word_x0 = word_x1
+                            # reset the accumulated characters in the word
+                            current_word = u""
+                        elif (index == last_index):
+                            # in the case of the last character in the line:
+                            # 1) the *line's* greatest x value is used as this element's, too
+                            word_info = "bbox %d %d %d %d"%(word_x0,word_y0,x1,word_y1)
+                            # 2) no space is put after the word span
+                            word_spans.append(("<span class='ocr_word' title='%s'>"%word_info,current_word+current_char,"</span>"))
+                        else:
+                            # if the current character is not a space, then append it to current word
+                            # which will be output in the <span class="ocr_word"> when a space *is*
+                            # encountered
+                            current_word = current_word + current_char
+                except Exception:
+                    E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-rpred?")
+                    word_spans = None
+            if word_spans is not None:
+                PN(" class='ocr_line' title='%s'>"%info)
+                for span_open, span_text, span_close in word_spans:
+                    # escape markup characters so OCR output such as & or < cannot break the HTML
+                    PN(span_open,span_text.replace(u"&",u"&amp;").replace(u"<",u"&lt;"),span_close)
+                PN("</span>")
+            else:
+                # no usable word data: fall back to the plain text of the line
+                PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
             if not args.nobreaks: P("<br />")
             else: P()
 

diff --git a/ocropus-rtrain b/ocropus-rtrain
@@ -315,9 +315,9 @@ for trial in range(start,args.ntrain):
     gta = "".join(codec.decode(acs))
     if not args.quiet:
         print("%d %.2f %s" % (trial, network.error, line.shape), fname)
-        print("   TRU:", repr(transcript))
-        print("   ALN:", repr(gta[:len(transcript)+5]))
-        print("   OUT:", repr(pred[:len(transcript)+5]))
+        print("   TRU:", transcript)
+        print("   ALN:", gta[:len(transcript)+5])
+        print("   OUT:", pred[:len(transcript)+5])
 
     pred = re.sub(' ','_',pred)
     gta = re.sub(' ','_',gta)