Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

smaller vertical separators #68

Closed
wants to merge 10 commits into from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ build/
*.os
*.a
*.so
.idea/
28 changes: 28 additions & 0 deletions ocropus-genauigkeit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/sh
# Run the recognizer with a given model over a fixed set of test pages and
# report the recognition errors against the ground truth.
#
# Usage: ocropus-genauigkeit.sh <MODEL-FILE>
#
# The file 'check.txt' in the current directory lists the page directories
# (whitespace separated) that make up the test set.
MODEL=$1

if [ ! -e "$MODEL" ]; then
    echo "USAGE: ocropus-genauigkeit.sh <MODEL-FILE>"
    exit 1
fi

if [ ! -e check.txt ]; then
    echo "Missing file 'check.txt' that contains the names of all directories (pages) used for the test."
    # Without the directory list every glob list below would be empty,
    # so there is nothing meaningful to run.
    exit 1
fi


# Collect glob patterns per page directory.  They are kept unexpanded in the
# variables and expanded by the shell when used unquoted further down.
TXT=""
GT=""
PNG=""
for d in $(cat check.txt); do
    TXT="$TXT $d/??????.txt"
    GT="$GT $d/*.gt.txt"
    PNG="$PNG $d/*.bin.png"
done

# Remove text output left over from a previous run; -f keeps rm quiet when a
# pattern matches nothing (e.g. on the very first run).
rm -f $TXT
ocropus-rpred -Q 6 -m "$MODEL" $PNG
#ocropus-econf $GT
#WITHDIFF=`ocropus-errs $GT 2>/dev/null |grep -v "^ 0" | cut -b 15- | sed 's/.gt.txt/.bin.png/' `
ocropus-errs -e $GT
#ocropus-gtedit html $WITHDIFF
4 changes: 2 additions & 2 deletions ocropus-gpageseg
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def DSAVE(title,image):
def compute_separators_morph(binary,scale):
"""Finds vertical black lines corresponding to column separators."""
d0 = int(max(5,scale/4))
d1 = int(max(5,scale))+args.sepwiden
d1 = int(max(5,scale/4)) + args.sepwiden
thick = morph.r_dilation(binary,(d0,d1))
vert = morph.rb_opening(thick,(10*scale,1))
vert = morph.r_erosion(vert,(d0//2,args.sepwiden))
Expand Down Expand Up @@ -205,7 +205,7 @@ def compute_colseps_conv(binary,scale=1.0):
grad = (grad>0.5*amax(grad))
DSAVE("2grad",grad)
# combine edges and whitespace
seps = minimum(thresh,maximum_filter(grad,(int(scale),int(5*scale))))
seps = minimum(thresh,maximum_filter(grad,(int(5*scale),int(5*scale))))
seps = maximum_filter(seps,(int(2*scale),1))
DSAVE("3seps",seps)
# select only the biggest column separators
Expand Down
2 changes: 1 addition & 1 deletion ocropus-linegen
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import random as pyrandom
import glob,sys,os,re,codecs,traceback
from pylab import *
from PIL import Image
import ImageFont,ImageDraw
from PIL import ImageFont,ImageDraw
from scipy.ndimage import filters,measurements,interpolation
from scipy.misc import imsave
import ocrolib
Expand Down
141 changes: 141 additions & 0 deletions ocropus-svg
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python

import __builtin__ as python
import random as pyrandom
import sys,os,re,glob,argparse,codecs
from pylab import median, imread

import ocrolib

# Command-line interface of ocropus-svg.
# NOTE(review): the usage text is passed as the first positional argument of
# ArgumentParser, whose first parameter is `prog` rather than `description`
# -- confirm that this is intended.
parser = argparse.ArgumentParser("""
Construct an SVG output file by putting together
the recognition results for each page.
You should usually invoke this program as

ocropus-svg 'book/????.bin.png'

For each page like 'book/0001.bin.png', it uses the following files:

book/0001.bin.png # page image
book/0001.pseg.png # page segmentation
book/0001/010001.txt # recognizer output for lines
""")
parser.add_argument(
    "-b", "--nobreaks",
    action="store_true",
    help="don't output line breaks",
)
parser.add_argument(
    "-p", "--nopars",
    action="store_true",
    help="don't output paragraphs",
)
parser.add_argument(
    "-s", "--fscale",
    type=float,
    default=1.0,
    help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s",
)
parser.add_argument(
    "-o", "--output",
    default="book.svg",
    help="output file, default: %(default)s",
)
parser.add_argument('files', nargs='+')

args = parser.parse_args()
# Let ocrolib expand the file arguments (they may be quoted glob patterns,
# as in the usage example above).
args.files = ocrolib.glob_all(args.files)

def E(*args):
    """Write all arguments, converted with str() and joined by single
    spaces, to stderr, followed by a newline."""
    stream = sys.stderr
    stream.write(" ".join(map(str, args)))
    stream.write("\n")
def P(*args):
    """Concatenate the string arguments and write them, followed by a
    newline, to the module-level output stream `ostream`."""
    line = "".join(args)
    ostream.write(line + "\n")
def PN(*args):
    """Concatenate the string arguments and write them to the module-level
    output stream `ostream` without appending a newline."""
    chunk = "".join(args)
    ostream.write(chunk)

# Estimate a typical x-height over all pages; it stays None when neither
# .xheight files nor line images are available.
median_xheight = None

# One line directory per input page: the page path with its extensions removed.
dirs = []
for name in args.files:
    dirs.append(ocrolib.allsplitext(name)[0])

xhfiles = []
for d in dirs:
    xhfiles.extend(glob.glob(d + "/??????.xheight"))

# NOTE(review): the `else` is taken to pair with the `len(xhfiles) > 5`
# test -- confirm against the original file's indentation.
if len(xhfiles) > 5:
    # enough per-line measurements: use their median directly
    xheights = [float(ocrolib.read_text(f)) for f in xhfiles]
    if xheights:
        median_xheight = median(xheights)
else:
    # fall back to half the median height of up to 100 randomly chosen
    # line images
    lfiles = []
    for d in dirs:
        lfiles.extend(glob.glob(d + "/??????.bin.png"))
    pyrandom.shuffle(lfiles)
    if lfiles:
        sample_heights = [imread(f).shape[0] for f in lfiles[:100]]
        median_xheight = 0.5 * median(sample_heights)

E("median_xheight", median_xheight)

# Write one SVG file per input page: the page image as background plus, for
# every segmented text line that has recognizer output, a blue outline
# rectangle and the recognized text.

# (x0, y0) of the most recently processed line; carried across pages.
last_coords = None

for arg in args.files:
    base, _ = ocrolib.allsplitext(arg)
    E("===", arg)

    # to proceed, we need a pseg file and a
    # subdirectory containing text lines
    #
    # These checks run before any output file is opened, so skipping a page
    # never writes to (or closes) an `ostream` that belongs to another page
    # or does not exist yet.

    if not os.path.exists(base + ".pseg.png"):
        E("%s: no such file" % (base + ".pseg.png",))
        continue

    if not os.path.isdir(base):
        E("%s: no such directory" % base)
        continue

    # iterate through the text lines in reading order, based
    # on the page segmentation file

    pseg = ocrolib.read_page_segmentation(base + ".pseg.png")
    height = pseg.shape[0]
    width = pseg.shape[1]

    # P() writes to the module-level name `ostream`
    ostream = codecs.open(base + ".svg", "w", "utf-8")
    try:
        P("<svg xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns='http://www.w3.org/2000/svg'>")
        P("<image y='0' x='0' height='%d' width='%d' xlink:href='../%s' />" % (height, width, arg))

        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)
        for i in range(1, regions.length()):

            # keep track of the bounding box information for each line
            # and insert paragraph breaks as needed
            # NOTE(review): `par` is computed but nothing in this script
            # consumes it yet; the logic is kept unchanged.

            line_id = regions.id(i)
            y0, x0, y1, x1 = regions.bboxMath(i)
            if last_coords is not None:
                lx0, ly0 = last_coords
                dx, dy = x0 - lx0, y1 - ly0
                par = 0
                if dy > 0:
                    par = 0  # column break... moving upwards
                elif median_xheight is not None:
                    if abs(dy) > 5 * median_xheight:
                        par = 1  # whitespace separator
                    if dx > 2 * median_xheight:
                        par = 1  # indented paragraph
                    if abs(dx) > 10 * median_xheight:
                        par = 1  # something else
            last_coords = (x0, y0)

            # get the text for the line itself

            lbase = "%s/%06x" % (base, line_id)

            if not os.path.exists(lbase + ".txt"):
                E("note: line %s produced no output (it may not have contained text)" % (lbase + ".bin.png"))
                continue

            text = ocrolib.read_text(lbase + ".txt")

            # Escape XML markup characters; '&' must be handled first so the
            # entities produced for '<' are not escaped a second time.
            # Plain str.replace is used because a re.sub replacement of
            # '\&amp;' keeps the backslash in the output.
            text = text.replace("&", "&amp;").replace("<", "&lt;")

            # estimate the font size for this line
            if median_xheight is not None and os.path.exists(lbase + ".xheight"):
                font_size = float(ocrolib.read_text(lbase + ".xheight"))
            elif median_xheight is not None:
                font_size = median_xheight
            else:
                font_size = 10

            # put it all together into RECT + TEXT
            # (y coordinates are flipped with `height - y1`)

            P("<g><rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" style=\"fill:none; stroke:blue;\"/>" % (x0, height - y1, x1 - x0, y1 - y0))
            P("<text x='%d' y='%d' style='font-size:%d;' class='ocr_line'>%s</text></g>" % (x0, height - y1 + font_size, font_size, text))

    finally:
        # always terminate the document and release the file, even if a
        # line failed part-way through the page
        try:
            P("</svg>")
        finally:
            ostream.close()



7 changes: 7 additions & 0 deletions ocropus-trainieren.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Start ocropus-rtrain on the line images of all page directories listed
# (whitespace separated) in 'train.txt' in the current directory.

if [ ! -e train.txt ]; then
    echo "Missing file 'train.txt' that contains the names of all directories (pages) used for training."
    exit 1
fi

# Start from an empty list so a value of ARGS inherited from the environment
# cannot leak into the command line.
ARGS=""
for d in $(cat train.txt); do
    # the pattern stays unexpanded here and is expanded by the shell when
    # $ARGS is used unquoted below
    ARGS="$ARGS $d/*.bin.png"
done

ocropus-rtrain --load ~/sandbox/ocropy/models/en-default.pyrnn.gz -F 1000 -o marine2wk $ARGS
3 changes: 3 additions & 0 deletions ocropus-trainierte-zeichen-zeigen.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh

# Print how often each character occurs in the ground-truth text
# (*.gt.txt) of the directories listed, one per line, in 'train.txt'.
#
# read -r keeps backslashes in directory names intact, and "$d" is quoted so
# names containing spaces are not split; the glob itself stays unquoted.
# The sed step puts every character on its own line (this relies on sed
# treating \n in the replacement as a newline, as GNU sed does).
while read -r d; do
    cat "$d"/*.gt.txt
done < train.txt | sed 's/\(.\)/\1\n/g' | sort | uniq -c