Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Construct SVG output files by putting together the recognition results for each page #139

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions ocropus-svg
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python

import __builtin__ as python
import random as pyrandom
import sys,os,re,glob,argparse,codecs
from pylab import median, imread

import ocrolib

# Command-line interface.  The long text is passed as `description` (the
# first positional parameter of ArgumentParser is `prog`, which would put
# the whole paragraph into the usage line), and the raw formatter keeps
# its line breaks intact in --help output.
# NOTE(review): --nobreaks, --nopars, --fscale and --output are parsed here
# but are not referenced anywhere in the visible part of this script.
parser = argparse.ArgumentParser(
    description="""
Construct an SVG output file by putting together
the recognition results for each page.
You should usually invoke this program as

ocropus-svg 'book/????.bin.png'

For each page like 'book/0001.bin.png', it uses the following files:

book/0001.bin.png # page image
book/0001.pseg.png # page segmentation
book/0001/010001.txt # recognizer output for lines
""",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-b", "--nobreaks", action="store_true",
                    help="don't output line breaks")
parser.add_argument("-p", "--nopars", action="store_true",
                    help="don't output paragraphs")
parser.add_argument("-s", "--fscale", type=float, default=1.0,
                    help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o", "--output", default="book.svg",
                    help="output file, default: %(default)s")
parser.add_argument('files', nargs='+')
args = parser.parse_args()
# Expand the file arguments through ocrolib's glob helper.
args.files = ocrolib.glob_all(args.files)

def E(*args):
    """Write all arguments to stderr as one space-separated line.

    Each argument is converted with ``str`` before joining; a newline is
    appended after the joined text.
    """
    message = " ".join(map(str, args))
    sys.stderr.write(message + "\n")
def P(*args):
    """Write the string arguments, concatenated, to ``ostream`` plus a newline.

    ``ostream`` is the module-level output stream that must be bound
    before this function is called.
    """
    line = "".join(args)
    ostream.write(line + "\n")
def PN(*args):
    """Write the string arguments, concatenated, to ``ostream`` with no newline.

    ``ostream`` is the module-level output stream that must be bound
    before this function is called.
    """
    chunk = "".join(args)
    ostream.write(chunk)

# Estimate a book-wide median x-height.  When more than five per-line
# ".xheight" files exist, their median is used directly; otherwise the
# estimate is half the median image height of up to 100 randomly chosen
# line images.  The value stays None when neither source is available.
median_xheight = None
dirs = [ocrolib.allsplitext(name)[0] for name in args.files]
xhfiles = [f for d in dirs for f in glob.glob(d + "/??????.xheight")]
if len(xhfiles) > 5:
    xheights = [float(ocrolib.read_text(f)) for f in xhfiles]
    if xheights:
        median_xheight = median(xheights)
else:
    lfiles = [f for d in dirs for f in glob.glob(d + "/??????.bin.png")]
    # shuffle so that the first 100 entries form a random sample
    pyrandom.shuffle(lfiles)
    if lfiles:
        sample_heights = [imread(f).shape[0] for f in lfiles[:100]]
        median_xheight = 0.5 * median(sample_heights)
E("median_xheight", median_xheight)

def _xml_escape(s, attr=False):
    """Return `s` with XML markup characters replaced by entities.

    With `attr` true, the single quote is escaped as well so the result can
    be placed inside a single-quoted attribute value.
    """
    # '&' must be handled first so the entities added below are not
    # escaped a second time.
    s = s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
    if attr:
        s = s.replace("'", "&apos;")
    return s


# Write one SVG file per page image: the page image itself, plus a blue
# outline rectangle and a text element for every recognized line.
for arg in args.files:
    base, _ = ocrolib.allsplitext(arg)
    E("===", arg)

    # to proceed, we need a pseg file and a
    # subdirectory containing text lines

    if not os.path.exists(base + ".pseg.png"):
        E("%s: no such file" % (base + ".pseg.png",))
        continue

    if not os.path.isdir(base):
        E("%s: no such directory" % base)
        continue

    # iterate through the text lines in reading order, based
    # on the page segmentation file

    pseg = ocrolib.read_page_segmentation(base + ".pseg.png")
    height = pseg.shape[0]
    width = pseg.shape[1]

    # The output file is opened only after all preconditions passed, so the
    # closing tag and the close() can never hit a missing stream or the
    # already-closed stream of a previous page.  P() writes to the
    # module-level name `ostream` bound here.
    with codecs.open(base + ".svg", "w", "utf-8") as ostream:
        try:
            P("<svg xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns='http://www.w3.org/2000/svg'>")
            P("<image y='0' x='0' height='%d' width='%d' xlink:href='../%s' />"
              % (height, width, _xml_escape(arg, attr=True)))

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)
            for i in range(1, regions.length()):

                line_id = regions.id(i)
                # presumably bboxMath measures y from the bottom of the
                # page, hence the `height - y1` flip below -- TODO confirm
                y0, x0, y1, x1 = regions.bboxMath(i)

                # get the text for the line itself

                lbase = "%s/%06x" % (base, line_id)

                if not os.path.exists(lbase + ".txt"):
                    E("note: line %s produced no output (it may not have contained text)" % (lbase + ".bin.png"))
                    continue

                text = _xml_escape(ocrolib.read_text(lbase + ".txt"))

                # estimate the font size for this line: the line's own
                # x-height when available, else the book-wide median,
                # else a fixed fallback
                if median_xheight is not None and os.path.exists(lbase + ".xheight"):
                    font_size = float(ocrolib.read_text(lbase + ".xheight"))
                elif median_xheight is not None:
                    font_size = median_xheight
                else:
                    font_size = 10

                # put it all together into RECT + TEXT

                P("<g><rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" style=\"fill:none; stroke:blue;\"/>"
                  % (x0, height - y1, x1 - x0, y1 - y0))
                P("<text x='%d' y='%d' style='font-size:%d;' class='ocr_line'>%s</text></g>"
                  % (x0, height - y1 + font_size, font_size, text))
        finally:
            # always terminate the document, even if a line failed, so the
            # partial output remains well-formed
            P("</svg>")