ocropus-archive · jze · Nov 19, 2015 · Nov 19, 2015 · Nov 19, 2015 · Dec 4, 2015
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ build/
 *.os
 *.a
 *.so
+.idea/
diff --git a/ocropus-genauigkeit.sh b/ocropus-genauigkeit.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Re-run recognition with a model on the pages listed in check.txt and report
+# the errors against the ground truth.
+#
+# Usage: ocropus-genauigkeit.sh <MODEL-FILE>
+#
+# check.txt (current directory) names the page directories to use.  For each
+# of them the script deletes earlier recognizer output (??????.txt), runs
+# ocropus-rpred with the given model on the *.bin.png files and finally calls
+# ocropus-errs on the *.gt.txt files.
+MODEL=$1
+
+if [ ! -e "$MODEL" ]; then
+	echo "USAGE: ocropus-genauigkeit.sh <MODEL-FILE>" >&2
+	exit 1
+fi
+
+if [ ! -e check.txt ]; then
+	echo "Missing file 'check.txt' that contains the names of all directories (pages) used for the test." >&2
+	# Without the page list there is nothing to evaluate; stop here instead
+	# of calling the tools with empty argument lists.
+	exit 1
+fi
+
+# Collect one glob pattern per page directory; the variables are expanded
+# unquoted below on purpose so that the shell resolves the patterns.
+TXT=""
+GT=""
+PNG=""
+for d in  `cat check.txt`; do
+	TXT="$TXT $d/??????.txt"
+	GT="$GT $d/*.gt.txt"
+	PNG="$PNG $d/*.bin.png"
+done
+
+# -f: pages without earlier output (or an empty list) must not raise an error.
+rm -f $TXT
+ocropus-rpred -Q 6 -m "$MODEL" $PNG
+#ocropus-econf $GT
+#WITHDIFF=`ocropus-errs $GT 2>/dev/null |grep -v "^     0" | cut -b 15- | sed 's/.gt.txt/.bin.png/' `
+ocropus-errs -e $GT
+#ocropus-gtedit html $WITHDIFF
diff --git a/ocropus-gpageseg b/ocropus-gpageseg
@@ -149,7 +149,7 @@ def DSAVE(title,image):
 def compute_separators_morph(binary,scale):
     """Finds vertical black lines corresponding to column separators."""
     d0 = int(max(5,scale/4))
-    d1 = int(max(5,scale))+args.sepwiden
+    d1 = int(max(5,scale/4)) + args.sepwiden
     thick = morph.r_dilation(binary,(d0,d1))
     vert = morph.rb_opening(thick,(10*scale,1))
     vert = morph.r_erosion(vert,(d0//2,args.sepwiden))
@@ -205,7 +205,7 @@ def compute_colseps_conv(binary,scale=1.0):
     grad = (grad>0.5*amax(grad))
     DSAVE("2grad",grad)
     # combine edges and whitespace
-    seps = minimum(thresh,maximum_filter(grad,(int(scale),int(5*scale))))
+    seps = minimum(thresh,maximum_filter(grad,(int(5*scale),int(5*scale))))
     seps = maximum_filter(seps,(int(2*scale),1))
     DSAVE("3seps",seps)
     # select only the biggest column separators

diff --git a/ocropus-linegen b/ocropus-linegen
@@ -5,7 +5,7 @@ import random as pyrandom
 import glob,sys,os,re,codecs,traceback
 from pylab import *
 from PIL import Image
-import ImageFont,ImageDraw
+from PIL import ImageFont,ImageDraw
 from scipy.ndimage import filters,measurements,interpolation
 from scipy.misc import imsave
 import ocrolib

diff --git a/ocropus-svg b/ocropus-svg
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+import __builtin__ as python
+import random as pyrandom
+import sys,os,re,glob,argparse,codecs
+from pylab import median, imread
+
+import ocrolib
+
+parser = argparse.ArgumentParser("""
+Construct an SVG output file by putting together
+the recognition results for each page.
+You should usually invoke this program as 
+
+    ocropus-svg 'book/????.bin.png'
+
+For each page like 'book/0001.bin.png', it uses the following files:
+
+    book/0001.bin.png            # page image
+    book/0001.pseg.png           # page segmentation
+    book/0001/010001.txt         # recognizer output for lines
+""")
+# NOTE: --nobreaks, --nopars, --fscale and --output are parsed but never read
+# below; every page is written to '<page base>.svg'.
+parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
+parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
+parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
+parser.add_argument("-o","--output",default="book.svg",help="output file, default: %(default)s")
+parser.add_argument('files',nargs='+')
+args = parser.parse_args()
+args.files = ocrolib.glob_all(args.files)
+
+def E(*args):
+    """Write the arguments, separated by blanks, as one line to stderr."""
+    sys.stderr.write(" ".join([str(x) for x in args])+"\n")
+
+def P(*args):
+    """Write the concatenated strings and a newline to the open SVG file."""
+    ostream.write("".join(args)+"\n")
+
+def PN(*args):
+    """Write the concatenated strings to the open SVG file, no newline."""
+    ostream.write("".join(args))
+
+# Estimate a typical x-height: use the recorded .xheight files when more than
+# five exist, otherwise take half the median height of up to 100 randomly
+# picked line images.  Stays None when neither kind of file is found.
+median_xheight = None
+dirs = [ocrolib.allsplitext(name)[0] for name in args.files]
+xhfiles = python.sum([glob.glob(d+"/??????.xheight") for d in dirs],[])
+if len(xhfiles)>5:
+    median_xheight = median([float(ocrolib.read_text(f)) for f in xhfiles])
+else:
+    lfiles = python.sum([glob.glob(d+"/??????.bin.png") for d in dirs],[])
+    pyrandom.shuffle(lfiles)
+    if len(lfiles)>0:
+        median_xheight = 0.5*median([imread(f).shape[0] for f in lfiles[:100]])
+E("median_xheight",median_xheight)
+
+for arg in args.files:
+    base,_ = ocrolib.allsplitext(arg)
+    E("===",arg)
+
+    # to proceed, we need a pseg file and a
+    # subdirectory containing text lines
+
+    if not os.path.exists(base+".pseg.png"):
+        E("%s: no such file"%(base+".pseg.png",))
+        continue
+
+    if not os.path.isdir(base):
+        E("%s: no such directory"%base)
+        continue
+
+    pseg = ocrolib.read_page_segmentation(base+".pseg.png")
+    height = pseg.shape[0]
+    width = pseg.shape[1]
+
+    # Open the output only after the checks above have passed, so that the
+    # closing tag written in the finally clause always goes to this page's
+    # file and never to an unopened or already closed stream.
+    ostream = codecs.open(base+".svg","w","utf-8")
+    try:
+        P("<svg xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns='http://www.w3.org/2000/svg'>")
+        P("<image y='0' x='0' height='%d' width='%d' xlink:href='../%s' />"%(height,width,arg))
+
+        # iterate through the text lines in reading order, based
+        # on the page segmentation file
+
+        regions = ocrolib.RegionExtractor()
+        regions.setPageLines(pseg)
+        for i in range(1,regions.length()):
+            line_id = regions.id(i)
+            y0,x0,y1,x1 = regions.bboxMath(i)
+
+            # get the text for the line itself
+
+            lbase = "%s/%06x"%(base,line_id)
+            if not os.path.exists(lbase+".txt"):
+                E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
+                continue
+
+            text = ocrolib.read_text(lbase+".txt")
+            # Escape with plain string replacement: as a re.sub replacement,
+            # '\&amp;' keeps its backslash and would put '\&amp;' into the
+            # output.  '&' has to be handled before '<'.
+            text = text.replace('&','&amp;').replace('<','&lt;')
+
+            # estimate the font size for this line
+            if median_xheight is not None and os.path.exists(lbase+".xheight"):
+                font_size = float(ocrolib.read_text(lbase+".xheight"))
+            elif median_xheight is not None:
+                font_size = median_xheight
+            else:
+                font_size = 10
+
+            # put it all together into RECT + TEXT
+
+            P("<g><rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" style=\"fill:none; stroke:blue;\"/>"%(x0,height-y1,x1-x0,y1-y0))
+            P("<text x='%d' y='%d' style='font-size:%d;' class='ocr_line'>%s</text></g>"%(x0,height-y1+font_size,font_size,text))
+
+    finally:
+        P("</svg>")
+        ostream.close()
+
diff --git a/ocropus-trainieren.sh b/ocropus-trainieren.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Run ocropus-rtrain, loading en-default.pyrnn.gz, on the *.bin.png files of
+# all page directories listed in train.txt (read from the current directory).
+
+if [ ! -e train.txt ]; then
+	echo "Missing file 'train.txt' that contains the names of all directories (pages) used for training." >&2
+	exit 1
+fi
+
+# Start from an empty list so that an ARGS value inherited from the
+# environment cannot leak into the command line.
+ARGS=""
+for d in  `cat train.txt`; do
+	ARGS="$ARGS $d/*.bin.png"
+done
+
+ocropus-rtrain --load ~/sandbox/ocropy/models/en-default.pyrnn.gz -F 1000 -o marine2wk $ARGS
diff --git a/ocropus-trainierte-zeichen-zeigen.sh b/ocropus-trainierte-zeichen-zeigen.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+# Count how often each character occurs in the *.gt.txt files of the page
+# directories listed, one per line, in train.txt.
+
+# read -r and the quoted "$d" keep backslashes and blanks in directory names
+# intact; sed puts every character on its own line so that uniq -c can count.
+while read -r d; do cat "$d"/*.gt.txt; done < train.txt | sed 's/\(.\)/\1\n/g' | sort | uniq -c
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ build/ @@
     *.os
     *.a
     *.so
+    .idea/
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/bin/sh

		cat train.txt | while read d; do cat $d/*.gt.txt; done | sed 's/\(.\)/\1\n/g'| sort | uniq -c