Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add option to output <span class=ocr_word> elements to hocr #314

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions ocrolib/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,24 +38,24 @@
(u'[_~#]',u"~"), # OCR control characters
(u'"',u"''"), # typewriter double quote
(u"`",u"'"), # grave accent
(u'[“”]',u"''"), # fancy quotes
# (u'[“”]',u"''"), # fancy quotes
(u"´",u"'"), # acute accent
(u"[‘’]",u"'"), # left single quotation mark
(u"[“”]",u"''"), # right double quotation mark
(u"“",u"''"), # German quotes
(u"„",u",,"), # German quotes
# (u"[‘’]",u"'"), # left single quotation mark
# (u"[“”]",u"''"), # right double quotation mark
# (u"“",u"''"), # German quotes
# (u"„",u",,"), # German quotes
(u"…",u"..."), # ellipsis
(u"′",u"'"), # prime
(u"″",u"''"), # double prime
(u"‴",u"'''"), # triple prime
(u"〃",u"''"), # ditto mark
(u"µ",u"μ"), # replace micro unit with greek character
(u"[–—]",u"-"), # variant length hyphens
(u"fl",u"fl"), # expand Unicode ligatures
(u"fi",u"fi"),
(u"ff",u"ff"),
(u"ffi",u"ffi"),
(u"ffl",u"ffl"),
# (u"fl",u"fl"), # expand Unicode ligatures
# (u"fi",u"fi"),
# (u"ff",u"ff"),
# (u"ffi",u"ffi"),
# (u"ffl",u"ffl"),
]

def requote(s):
Expand Down
2 changes: 1 addition & 1 deletion ocrolib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def normalize_text(s):
This eliminates common ambiguities and weird unicode
characters."""
s = unicode(s)
s = unicodedata.normalize('NFC',s)
s = unicodedata.normalize('NFD',s)
s = re.sub(ur'\s+(?u)',' ',s)
s = re.sub(ur'\n(?u)','',s)
s = re.sub(ur'^\s+(?u)','',s)
Expand Down
2 changes: 1 addition & 1 deletion ocrolib/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,7 @@ def ctc_align_targets(outputs,targets,threshold=100.0,verbose=0,debug=0,lo=1e-5)
return aligned

def normalize_nfkc(s):
return unicodedata.normalize('NFKC',s)
return unicodedata.normalize('NFD',s)

def add_training_info(network):
return network
Expand Down
100 changes: 95 additions & 5 deletions ocropus-hocr
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ import re
import glob
import argparse
import codecs

import unicodedata
import numpy as np
from matplotlib.pyplot import imread

#from unicodedata import name
import ocrolib
from ocrolib import hocr

Expand All @@ -32,6 +32,7 @@ parser.add_argument("-b","--nobreaks",action="store_true",help="don't output lin
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument("-w","--ocrwords",action="store_true",help="output ocr_word spans in hocr")
parser.add_argument('files',nargs='+')
args = parser.parse_args()
args.files = ocrolib.glob_all(args.files)
Expand Down Expand Up @@ -125,8 +126,44 @@ for arg in args.files:
text = re.sub(r'\&','\&amp;',text)
text = re.sub(r'\<','\&lt;',text)

# accumulate information for each line here

# Accumulate character position information for each line if the user wants
# <span class="ocr_word"> elements and the .llocs file is available (these
# files are written when the --llocs switch is set on ocropus-rpred).
#
# char_coords is reset for every line: without this, a line whose .llocs
# file is missing would either reuse the previous line's data or leave the
# name undefined when it is read further down.
char_coords = []
if args.ocrwords and os.path.exists(lbase+".llocs"):
    # close the file deterministically instead of leaking the handle
    with codecs.open(lbase+".llocs",'r','utf-8') as stream:
        char_pos_data = stream.read()
    lines = char_pos_data.split("\n")
    for line in lines:
        elements = line.split("\t")
        # each useful row is "<character>\t<x position>"; blank rows and
        # rows without a character carry no information, so skip them
        if len(elements) != 2 or elements[0] == '':
            continue
        char_coords.append([elements[0],int(float(elements[1]))])
    # Remove one final and one initial space, since they do not signify and
    # they mess up word bboxes. The emptiness guards matter: a .llocs file
    # with no usable rows (or only a single space) must not raise IndexError.
    if char_coords and char_coords[-1][0] == u" ":
        char_coords = char_coords[:-1]
    if char_coords and char_coords[0][0] == u" ":
        char_coords = char_coords[1:]
    # NOTE(review): the original author observed that rpred seems to report
    # the x position of the first real (non-space) character for a leading
    # space; no correction for that is applied here -- confirm whether one
    # is needed.
style = ""
info = ""

Expand All @@ -149,7 +186,60 @@ for arg in args.files:

PN("<span")
if style!="": PN(" style='"+style+"'")
PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
# Use the data from the .llocs files to emit <span class="ocr_word"> elements
# if the user asked for them and the collected data is not empty. For
# instance, if the line only contained a space character, it would have been
# stripped and the list would be empty.
if args.ocrwords and len(char_coords) > 0:
    try:
        # Build every word span before writing anything, so that a failure
        # cannot leave a half-written ocr_line element in the output.
        word_spans = []
        # The line's coordinates seed the word coordinates; the line's y
        # values (y0, y1) are used unchanged for every word bounding box.
        word_x0 = x0
        current_word = u""
        last_index = len(char_coords) - 1
        for index, (current_char, char_x) in enumerate(char_coords):
            if current_char == u" ":
                # a space closes the current word; its right edge is the
                # x position recorded for the space, offset by the line's x0
                word_x1 = char_x + x0
                closing = "</span> "
            elif index == last_index:
                # last character of the line: it belongs to the final word,
                # the line's greatest x value is used as the word's right
                # edge, and no space is put after the word span
                current_word = current_word + current_char
                word_x1 = x1
                closing = "</span>"
            else:
                # an ordinary character: keep accumulating the current word
                current_word = current_word + current_char
                continue
            word_info = "bbox %d %d %d %d"%(word_x0,y0,word_x1,y1)
            # escape the word like the plain line text is escaped, so that
            # '&' or '<' in the recognized text cannot break the markup
            escaped = current_word.replace(u"&",u"&amp;").replace(u"<",u"&lt;")
            word_spans.append(("<span class='ocr_word' title='%s'>"%word_info,escaped,closing))
            # the next word begins where this one ended
            word_x0 = word_x1
            current_word = u""
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate; fall back to the plain line output
        E("Data for ocr_word elements is not available. Did you select --llocs in ocropus-rpred?")
        PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
    else:
        PN(" class='ocr_line' title='%s'>"%info)
        for span_parts in word_spans:
            PN(*span_parts)
        PN("</span>")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation looks off on GitHub.

else:
PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
if not args.nobreaks: P("<br />")
else: P()

Expand Down
6 changes: 3 additions & 3 deletions ocropus-rtrain
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,9 @@ for trial in range(start,args.ntrain):
gta = "".join(codec.decode(acs))
if not args.quiet:
print("%d %.2f %s" % (trial, network.error, line.shape), fname)
print(" TRU:", repr(transcript))
print(" ALN:", repr(gta[:len(transcript)+5]))
print(" OUT:", repr(pred[:len(transcript)+5]))
print(" TRU:", transcript)
print(" ALN:", gta[:len(transcript)+5])
print(" OUT:", pred[:len(transcript)+5])

pred = re.sub(' ','_',pred)
gta = re.sub(' ','_',gta)
Expand Down