From 84d8f6c503e3932c35e0d71850a5e35bfb7588e6 Mon Sep 17 00:00:00 2001 From: Bruce Robertson Date: Thu, 25 Oct 2018 09:12:29 -0300 Subject: [PATCH 1/6] add option to output elements to hocr --- ocropus-hocr | 76 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/ocropus-hocr b/ocropus-hocr index 43b7222f..23fc389a 100755 --- a/ocropus-hocr +++ b/ocropus-hocr @@ -32,6 +32,7 @@ parser.add_argument("-b","--nobreaks",action="store_true",help="don't output lin parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs") parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s") parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s") +parser.add_argument("-w","--ocrwords",action="store_true",help="output ocr_word spans in hocr") parser.add_argument('files',nargs='+') args = parser.parse_args() args.files = ocrolib.glob_all(args.files) @@ -125,8 +126,26 @@ for arg in args.files: text = re.sub(r'\&','\&',text) text = re.sub(r'\<','\<',text) - # accumulate information for each line here - + # accumulate character position information for each line if the + # user wants and if the llocs files are available + # (these are output by setting the --llocs switch on ocropus-gpageseg) + if (args.ocrwords and os.path.exists(lbase+".llocs")): + char_pos_data = codecs.open(lbase+".llocs",'r','utf-8').read() + lines = char_pos_data.split("\n") + #the last line is blank, providing no info + lines = lines[:-1] + char_coords = [] + for line in lines: + elements = line.split("\t") + to_coords = [elements[0],int(float(elements[1]))] + if not (elements[0] == ''): + char_coords.append(to_coords) + # remove final and initial spaces in lines, since they do not signify and they + # mess up word bboxes + if (char_coords[-1][0] == u" "): + char_coords = 
char_coords[:-1] + if (char_coords[0][0] == u" "): + char_coords = char_coords[1:] style = "" info = "" @@ -149,7 +168,58 @@ for arg in args.files: PN(""%info,text,"") + # use the data from the llocs files to provide elements and their + # content if the user so desires and if the collected data is not empty. For instance, + # if the line only contained a space character, it would be striped and the list would + # be empty + if args.ocrwords and (len(char_coords) > 0): + try: + PN(" class='ocr_line' title='%s'>"%info) + # loop through all the characters in the line, breaking to make a new ocr_word + # when we come to a space + # use the line coordinates to seed the word coordinates. In particular the y values + # are always used for word bounding boxes + word_x0 = x0 + word_y0 = y0 + word_x1 = x0 + word_y1 = y1 + current_word = u"" + # the last element is a special case, so we run a counter to be able to detect it + char_coords_count = len(char_coords) + count = 0 + # keep track of the last char's x in order to put the word x boundary in the middle of the + # space between words. 
This ensures that no part of the word gets omitted from the bbox + previous_char_x = 0 + for char_coord in char_coords: + current_char, char_x = char_coord + if (current_char == u" "): + midpoint = (char_x + previous_char_x) / 2 + word_x1 = midpoint + x0 + word_info="bbox %d %d %d %d"%(word_x0,word_y0,word_x1,word_y1) + PN(""%word_info,current_word," ") + # set the beginning x of the next word to the ending x of this one + word_x0 = word_x1 + # reset the accumulated characters in the word + current_word = u"" + elif (count == (char_coords_count-1)): + # in the case of the last character in the line: + # 1) the *line's* greatest x value is used as this element's, too + word_info="bbox %d %d %d %d"%(word_x0,word_y0,x1,word_y1) + # 2) no space is put after the word span + PN(""%word_info,current_word,"") + else: + # if the current character is not a space, then append it to current word + # which will be outputted in the ") + except: + E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-gpageseg?") + PN(" class='ocr_line' title='%s'>"%info,text,"") + else: + PN(" class='ocr_line' title='%s'>"%info,text,"") if not args.nobreaks: P("
") else: P() From 9a0d07bb94b8169b4d02eed944d0078752213fa9 Mon Sep 17 00:00:00 2001 From: Bruce Robertson Date: Thu, 25 Oct 2018 10:01:52 -0300 Subject: [PATCH 2/6] correct the origin of llocs file --- ocropus-hocr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocropus-hocr b/ocropus-hocr index 23fc389a..3e09aab4 100755 --- a/ocropus-hocr +++ b/ocropus-hocr @@ -128,7 +128,7 @@ for arg in args.files: # accumulate character position information for each line if the # user wants and if the llocs files are available - # (these are output by setting the --llocs switch on ocropus-gpageseg) + # (these are output by setting the --llocs switch on ocropus-rpred) if (args.ocrwords and os.path.exists(lbase+".llocs")): char_pos_data = codecs.open(lbase+".llocs",'r','utf-8').read() lines = char_pos_data.split("\n") @@ -216,7 +216,7 @@ for arg in args.files: previous_char_x = char_x PN("") except: - E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-gpageseg?") + E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-rpred?") PN(" class='ocr_line' title='%s'>"%info,text,"
") else: PN(" class='ocr_line' title='%s'>"%info,text,"") From 885fb17ee71aaf622ecfe5591ba9e08a99a879cd Mon Sep 17 00:00:00 2001 From: Bruce Robertson Date: Fri, 16 Nov 2018 11:15:25 -0400 Subject: [PATCH 3/6] deal with corner case which sometimes lops off final character in word --- ocropus-hocr | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ocropus-hocr b/ocropus-hocr index 3e09aab4..95537619 100755 --- a/ocropus-hocr +++ b/ocropus-hocr @@ -132,20 +132,22 @@ for arg in args.files: if (args.ocrwords and os.path.exists(lbase+".llocs")): char_pos_data = codecs.open(lbase+".llocs",'r','utf-8').read() lines = char_pos_data.split("\n") - #the last line is blank, providing no info - lines = lines[:-1] char_coords = [] for line in lines: elements = line.split("\t") - to_coords = [elements[0],int(float(elements[1]))] - if not (elements[0] == ''): - char_coords.append(to_coords) + #sometimes a line is blank, providing no info. We skip these + if (len(elements) == 2): + to_coords = [elements[0],int(float(elements[1]))] + if not (elements[0] == ''): + char_coords.append(to_coords) # remove final and initial spaces in lines, since they do not signify and they # mess up word bboxes if (char_coords[-1][0] == u" "): char_coords = char_coords[:-1] if (char_coords[0][0] == u" "): char_coords = char_coords[1:] + for char_coord in char_coords: + print char_coord style = "" info = "" @@ -206,7 +208,7 @@ for arg in args.files: # 1) the *line's* greatest x value is used as this element's, too word_info="bbox %d %d %d %d"%(word_x0,word_y0,x1,word_y1) # 2) no space is put after the word span - PN(""%word_info,current_word,"") + PN(""%word_info,current_word+current_char,"") else: # if the current character is not a space, then append it to current word # which will be outputted in the Date: Tue, 23 Jul 2019 11:04:43 -0300 Subject: [PATCH 4/6] use the edge of the space as the beginning of word. 
(Classifiers vary as to whether they use the leading or trailing edge, so this is the best we can do.) --- ocropus-hocr | 28 +++++++++++++++++++++++----- ocropus-rtrain | 6 +++--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ocropus-hocr b/ocropus-hocr index 95537619..ee81862c 100755 --- a/ocropus-hocr +++ b/ocropus-hocr @@ -8,10 +8,10 @@ import re import glob import argparse import codecs - +import unicodedata import numpy as np from matplotlib.pyplot import imread - +#from unicodedata import name import ocrolib from ocrolib import hocr @@ -146,8 +146,24 @@ for arg in args.files: char_coords = char_coords[:-1] if (char_coords[0][0] == u" "): char_coords = char_coords[1:] + #it seems rpred messes up when this happens, and gives the coord of the first real + #(non-space) character. So we need to bump them up + #print "we be spacing!" + #print "in" + #print char_coords + #b=[row[0] for row in char_coords] + #c=[row[1] for row in char_coords] + #b=b[1:] + #c=c[:-1] + #out = [] + #for x, y in zip(b, c): + #out = out + [[x,y]] + #char_coords=out + #print "out" + #print char_coords for char_coord in char_coords: - print char_coord + print char_coord[0] + " " + unicodedata.name(char_coord[0][0]) + " " + str(char_coord[1]) + #print "'", name(char_cood[0]) style = "" info = "" @@ -192,11 +208,12 @@ for arg in args.files: # keep track of the last char's x in order to put the word x boundary in the middle of the # space between words. 
This ensures that no part of the word gets omitted from the bbox previous_char_x = 0 + previous_char = u"" for char_coord in char_coords: current_char, char_x = char_coord - if (current_char == u" "): + if (current_char == u" "): midpoint = (char_x + previous_char_x) / 2 - word_x1 = midpoint + x0 + word_x1 = char_x + x0 word_info="bbox %d %d %d %d"%(word_x0,word_y0,word_x1,word_y1) PN(""%word_info,current_word," ") # set the beginning x of the next word to the ending x of this one @@ -216,6 +233,7 @@ for arg in args.files: current_word = current_word + current_char count = count + 1 previous_char_x = char_x + previous_char = current_char PN("") except: E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-rpred?") diff --git a/ocropus-rtrain b/ocropus-rtrain index 1e92e3cb..5c871072 100755 --- a/ocropus-rtrain +++ b/ocropus-rtrain @@ -315,9 +315,9 @@ for trial in range(start,args.ntrain): gta = "".join(codec.decode(acs)) if not args.quiet: print("%d %.2f %s" % (trial, network.error, line.shape), fname) - print(" TRU:", repr(transcript)) - print(" ALN:", repr(gta[:len(transcript)+5])) - print(" OUT:", repr(pred[:len(transcript)+5])) + print(" TRU:", transcript) + print(" ALN:", gta[:len(transcript)+5]) + print(" OUT:", pred[:len(transcript)+5]) pred = re.sub(' ','_',pred) gta = re.sub(' ','_',gta) From 1f5e8af7b389cd01de89c0f8dea7bd8733cfa8f3 Mon Sep 17 00:00:00 2001 From: Bruce Robertson Date: Tue, 23 Jul 2019 11:05:15 -0300 Subject: [PATCH 5/6] disable quote and ligature char substitutions --- ocrolib/chars.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ocrolib/chars.py b/ocrolib/chars.py index 8ee82c5c..112c006f 100644 --- a/ocrolib/chars.py +++ b/ocrolib/chars.py @@ -38,12 +38,12 @@ (u'[_~#]',u"~"), # OCR control characters (u'"',u"''"), # typewriter double quote (u"`",u"'"), # grave accent - (u'[“”]',u"''"), # fancy quotes +# (u'[“”]',u"''"), # fancy quotes (u"´",u"'"), # acute accent - (u"[‘’]",u"'"), # 
left single quotation mark - (u"[“”]",u"''"), # right double quotation mark - (u"“",u"''"), # German quotes - (u"„",u",,"), # German quotes +# (u"[‘’]",u"'"), # left single quotation mark +# (u"[“”]",u"''"), # right double quotation mark +# (u"“",u"''"), # German quotes +# (u"„",u",,"), # German quotes (u"…",u"..."), # ellipsis (u"′",u"'"), # prime (u"″",u"''"), # double prime @@ -51,11 +51,11 @@ (u"〃",u"''"), # ditto mark (u"µ",u"μ"), # replace micro unit with greek character (u"[–—]",u"-"), # variant length hyphens - (u"fl",u"fl"), # expand Unicode ligatures - (u"fi",u"fi"), - (u"ff",u"ff"), - (u"ffi",u"ffi"), - (u"ffl",u"ffl"), +# (u"fl",u"fl"), # expand Unicode ligatures +# (u"fi",u"fi"), +# (u"ff",u"ff"), +# (u"ffi",u"ffi"), +# (u"ffl",u"ffl"), ] def requote(s): From 2619e62bb35aa4eb620f87e79091687d31591ffc Mon Sep 17 00:00:00 2001 From: Bruce Robertson Date: Tue, 23 Jul 2019 11:05:38 -0300 Subject: [PATCH 6/6] use decomposed unicode always --- ocrolib/common.py | 2 +- ocrolib/lstm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrolib/common.py b/ocrolib/common.py index 4b5ee87c..7c4f5c8e 100644 --- a/ocrolib/common.py +++ b/ocrolib/common.py @@ -48,7 +48,7 @@ def normalize_text(s): This eliminates common ambiguities and weird unicode characters.""" s = unicode(s) - s = unicodedata.normalize('NFC',s) + s = unicodedata.normalize('NFD',s) s = re.sub(ur'\s+(?u)',' ',s) s = re.sub(ur'\n(?u)','',s) s = re.sub(ur'^\s+(?u)','',s) diff --git a/ocrolib/lstm.py b/ocrolib/lstm.py index f5307590..709350d6 100644 --- a/ocrolib/lstm.py +++ b/ocrolib/lstm.py @@ -838,7 +838,7 @@ def ctc_align_targets(outputs,targets,threshold=100.0,verbose=0,debug=0,lo=1e-5) return aligned def normalize_nfkc(s): - return unicodedata.normalize('NFKC',s) + return unicodedata.normalize('NFD',s) def add_training_info(network): return network