-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_stations.py
127 lines (110 loc) · 4.09 KB
/
get_stations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import cv2
import numpy as np
import argparse
import os
import pytesseract
from PIL import Image
def get_num_columns(tess_data, counter):
    """
    Get the number of columns in a table
    ====================================
    Inputs:
    tess_data (list): one entry per table; each entry is an iterable of
        comma-separated row strings (tesseract output for the page)
    counter (int): position of the table; the entry actually inspected is
        ``tess_data[counter - 1]`` (so ``counter == 0`` selects the last
        entry)
    ====================================
    Output:
    length (int): the field count shared by the most rows. Ties are
        resolved in favour of the smaller count. Returns 0 when the
        selected table has no rows.
    """
    # Tally how many rows have each field count. A dict (instead of a fixed
    # 20-slot list) means rows with 20+ fields no longer raise IndexError.
    tally = {}
    for row in tess_data[counter - 1]:
        n_fields = len(row.split(","))
        tally[n_fields] = tally.get(n_fields, 0) + 1

    # No rows at all: report 0 columns rather than failing on an unbound
    # local, so the caller can apply its own fallback.
    if not tally:
        return 0

    # Most frequent field count wins; the smallest count breaks ties. A
    # single-row table is handled too (its one count is the answer).
    best = max(tally.values())
    return min(n for n, hits in tally.items() if hits == best)
def tables_from_contours(img, cnt, min_area=100000, header_height=200):
    """
    From the contours generated from preprocessing
    Identify the tables and stations regions in the image
    ===============================================
    Inputs:
    img (numpy array): image the contours were detected on
    cnt (list): detected contours in image
    min_area (number): a contour is kept only if its area is strictly
        greater than this value (default 100000)
    header_height (int): number of pixel rows directly above each table's
        bounding box to return as its stations region (default 200)
    ===============================================
    Outputs:
    tables (list of numpy arrays): bounding-box crop of every kept contour
    stations_regions (list of numpy arrays): strip above each table, same
        order as ``tables``
    Both lists are returned in reverse contour order.
    """
    tables = []
    stations_regions = []
    for contour in cnt:
        # Skip everything that is too small to be a table
        if cv2.contourArea(contour) <= min_area:
            continue

        # get rectangle coordinates from the contour
        x, y, w, h = cv2.boundingRect(contour)

        # crop out the table region from the image
        tables.append(img[y:y + h, x:x + w])

        # Cut out the strip above the table containing the stations. The
        # start row is clamped at 0: a negative start would be interpreted
        # by NumPy as counting from the bottom of the image and produce an
        # empty or wrong slice for tables near the top edge.
        top = max(y - header_height, 0)
        stations_regions.append(img[top:y, x:x + w])

    tables.reverse()
    stations_regions.reverse()
    return tables, stations_regions
def get_station_names(input_image, station_init):
    """
    Returns the name of the stations in each table
    ==============================================
    Inputs:
    input_image (str): path of the image file, read as grayscale
    station_init (list): formatted output from tesseract, passed to
        get_num_columns to estimate each table's column count
    ==============================================
    Outputs:
    table_stations (list of str): one string per detected table, each made
        of the OCR'd station names with a "," after every name (including
        the last one). The list is in reverse order of the tables returned
        by tables_from_contours.
    Raises:
    IOError: if the image cannot be read
    """
    config = "--psm 6 --oem 2 eng -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-./"
    img = cv2.imread(input_image, 0)
    # cv2.imread signals failure by returning None instead of raising
    if img is None:
        raise IOError("Could not read image: {!r}".format(input_image))

    height, width = img.shape[:2]
    kernel = np.ones((15, 15), np.uint8)
    eroded = cv2.erode(img, kernel, iterations=2)
    smoothed = cv2.dilate(eroded, kernel, iterations=1)
    _, th = cv2.threshold(smoothed, 150, 255, cv2.THRESH_BINARY_INV)

    # floodFill requires a mask 2 pixels larger than the image in each axis
    mask = np.zeros((height + 2, width + 2), np.uint8)
    # pick out the region containing the table using flood fill
    # NOTE(review): the seed point (200, 200) is hard-coded, so the image
    # must be larger than 200 px in both dimensions.
    cv2.floodFill(th, mask, (200, 200), 255)
    # invert colors
    out = cv2.bitwise_not(th)
    out = cv2.dilate(out, kernel, iterations=3)

    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; the contours are always the
    # second-to-last item.
    cnt = cv2.findContours(out, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    tables, station_regions = tables_from_contours(img, cnt)
    table_stations = []
    for table_idx, (table, station_region) in enumerate(zip(tables, station_regions)):
        # NOTE(review): get_num_columns reads station_init[counter - 1], so
        # the 0-based index passed here makes the first table use the last
        # entry of station_init - confirm this is intended.
        length = get_num_columns(station_init, table_idx)
        length = 10 if length < 8 else length
        width_cutoff = int(table.shape[1] / length)

        # cutoff image regions for different stations
        names = []
        for col in range(1, length + 1):
            start = (col - 1) * width_cutoff
            if col != 1:
                # Widen every column but the first by 50 px to the left;
                # clamp at 0 so narrow columns cannot yield a negative
                # (wrap-around) slice start.
                start = max(start - 50, 0)
            cell = station_region[:, start:col * width_cutoff]
            text = pytesseract.image_to_string(cell, config=config)
            # keep only the first line of the OCR output
            names.append(text.split('\n')[0].strip())

        # every name is followed by a comma, including the last one
        table_stations.append("".join(name + "," for name in names))
    return table_stations[::-1]