ocropus-archive · kba · Dec 4, 2015
diff --git a/ocropus-svg b/ocropus-svg
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+import __builtin__ as python
+import random as pyrandom
+import sys,os,re,glob,argparse,codecs
+from pylab import median, imread
+
+import ocrolib
+
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,description="""
+Construct an SVG output file by putting together
+the recognition results for each page.
+You should usually invoke this program as 
+
+    ocropus-svg 'book/????.bin.png'
+
+For each page like 'book/0001.bin.png', it uses the following files:
+
+    book/0001.bin.png            # page image
+    book/0001.pseg.png           # page segmentation
+    book/0001/010001.txt         # recognizer output for lines
+""")  # passed as description= (a bare first argument is `prog`); the raw formatter keeps the layout above
+parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
+parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
+parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
+parser.add_argument("-o","--output",default="book.svg",help="output file, default: %(default)s")
+parser.add_argument('files',nargs='+')  # NOTE(review): nothing in the visible script reads nobreaks, nopars, fscale or output
+args = parser.parse_args()
+args.files = ocrolib.glob_all(args.files)  # NOTE(review): presumably expands the shell-style patterns given above -- confirm in ocrolib
+
+def E(*args):
+    # report on stderr: the str() of every argument, separated by spaces, then a newline
+    sys.stderr.write(" ".join(map(str,args)))
+    sys.stderr.write("\n")
+def P(*pieces):  # write the concatenated string pieces followed by a newline to the global `ostream`
+    ostream.write("".join(pieces+("\n",)))
+def PN(*pieces):  # write the concatenated string pieces to the global `ostream`, without a trailing newline
+    ostream.write(str().join(pieces))
+
+# Document-wide x-height estimate: the median of the per-line .xheight values when more than five such
+# files exist, otherwise half the median height (shape[0]) of up to 100 randomly picked line images.
+line_dirs = [ocrolib.allsplitext(name)[0] for name in args.files]
+xheight_paths = [p for d in line_dirs for p in glob.glob(d+"/??????.xheight")]
+median_xheight = None
+if len(xheight_paths)>5:
+    median_xheight = median([float(ocrolib.read_text(p)) for p in xheight_paths])
+else:
+    line_images = [p for d in line_dirs for p in glob.glob(d+"/??????.bin.png")]
+    pyrandom.shuffle(line_images)
+    if line_images:
+        median_xheight = 0.5*median([imread(p).shape[0] for p in line_images[:100]])
+E("median_xheight",median_xheight)
+
+# Write one SVG file per page, next to the page image: the page bitmap as a
+# background <image>, then for each recognized text line a blue outline <rect>
+# and a <text> element carrying the recognizer output.
+
+for arg in args.files:
+    base,_ = ocrolib.allsplitext(arg)
+    E("===",arg)
+
+    # to proceed, we need a pseg file and a
+    # subdirectory containing text lines
+
+    if not os.path.exists(base+".pseg.png"):
+        E("%s: no such file"%(base+".pseg.png",))
+        continue
+
+    if not os.path.isdir(base):
+        E("%s: no such directory"%base)
+        continue
+
+    pseg = ocrolib.read_page_segmentation(base+".pseg.png")
+    height = pseg.shape[0]
+    width = pseg.shape[1]
+
+    # Open the output only after the checks above have passed, and enter the
+    # try block only once the open has succeeded, so that the cleanup in
+    # `finally` always operates on this page's own, open stream (a skipped
+    # page must not touch an undefined or already closed one).
+
+    ostream = codecs.open(base+".svg","w","utf-8")
+    try:
+        P("<svg xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns='http://www.w3.org/2000/svg'>")
+        P("<image y='0' x='0' height='%d' width='%d' xlink:href='../%s' />"%(height,width,arg))
+
+        # iterate through the text lines in reading order, based
+        # on the page segmentation file
+
+        regions = ocrolib.RegionExtractor()
+        regions.setPageLines(pseg)
+        for i in range(1,regions.length()):
+
+            # bounding box of the line; presumably in y-up ("math")
+            # coordinates, hence the height-y1 flip below -- TODO confirm
+
+            line_id = regions.id(i)
+            y0,x0,y1,x1 = regions.bboxMath(i)
+
+            # get the text for the line itself
+
+            lbase = "%s/%06x"%(base,line_id)
+
+            if not os.path.exists(lbase+".txt"):
+                E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
+                continue
+
+            text = ocrolib.read_text(lbase+".txt")
+
+            # Escape the XML markup characters, "&" first so the entities
+            # produced for "<" and ">" are not escaped a second time.  Plain
+            # str.replace is used because re.sub leaves the unknown escape in
+            # a replacement like '\&amp;' alone and would emit the backslash.
+
+            text = text.replace("&","&amp;")
+            text = text.replace("<","&lt;")
+            text = text.replace(">","&gt;")
+
+            # font size for this line: its own .xheight file when a document
+            # median exists and the file is present, else that median, else 10
+
+            if median_xheight is not None and os.path.exists(lbase+".xheight"):
+                font_size = float(ocrolib.read_text(lbase+".xheight"))
+            elif median_xheight is not None:
+                font_size = median_xheight
+            else:
+                font_size = 10
+
+            # put it all together into RECT + TEXT
+
+            P("<g><rect x=\"%d\" y=\"%d\" width=\"%d\" height=\"%d\" style=\"fill:none; stroke:blue;\"/>"%(x0,height-y1,x1-x0,y1-y0))
+            P("<text x='%d' y='%d' style='font-size:%d;' class='ocr_line'>%s</text></g>"%(x0,height-y1+font_size,font_size, text))
+
+    finally:
+        # runs on errors as well, so the document is always terminated
+        # and the file closed
+        P("</svg>")
+        ostream.close()
+
+
+