-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathscript.py
More file actions
110 lines (110 loc) · 4.18 KB
/
script.py
File metadata and controls
110 lines (110 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import cv2
import matplotlib.pyplot as plt
# --- Load the table image and binarize it ---------------------------------
# Path of the table image to process.
file = r'test_table_img.png'

# Flag 0 makes OpenCV decode the file as a single-channel grayscale image.
img = cv2.imread(file, 0)
# cv2.imread does not raise on a missing or undecodable file; it returns
# None. Fail here with a clear message instead of an opaque TypeError at the
# arithmetic below.
if img is None:
    raise OSError("could not read image file: {!r}".format(file))

# Small 2x2 rectangular structuring element, used further down when the
# detected ruling lines are eroded.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

# Invert the grayscale image so dark ink becomes bright.
img_bin2 = 255 - img

# With THRESH_OTSU set, OpenCV picks the threshold itself and returns it as
# the first value; the 128 passed here is ignored.
thresh1, img_bin_otsu = cv2.threshold(
    img_bin2, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
import numpy as np
# --- Build a mask of the table's ruling lines ------------------------------
# Length of the line-detection kernels: 1% of the image width, but never less
# than 1 so that images narrower than 100 px do not yield a zero-sized kernel.
# NOTE(review): the vertical kernel is also sized from the width (shape[1]),
# exactly as before; confirm that the height was not intended.
line_len = max(1, img.shape[1] // 100)

# Erode then dilate with a 1-px-wide, line_len-tall kernel; intended to keep
# only vertical strokes at least that tall.
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, line_len))
eroded_image = cv2.erode(img_bin_otsu, vertical_kernel, iterations=3)
vertical_lines = cv2.dilate(eroded_image, vertical_kernel, iterations=3)

# Same idea with a line_len-wide, 1-px-tall kernel for horizontal strokes.
hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_len, 1))
horizontal_lines = cv2.erode(img_bin_otsu, hor_kernel, iterations=5)
horizontal_lines = cv2.dilate(horizontal_lines, hor_kernel, iterations=5)

# Blend both masks with equal weight, invert the result, erode it with the
# 2x2 kernel and re-binarize it with Otsu's threshold.
vertical_horizontal_lines = cv2.addWeighted(
    vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
vertical_horizontal_lines = cv2.erode(
    ~vertical_horizontal_lines, kernel, iterations=3)
thresh, vertical_horizontal_lines = cv2.threshold(
    vertical_horizontal_lines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

# XOR the grayscale image with the line mask and invert the result; `bitnot`
# is the image the OCR step crops its cells from.
bitxor = cv2.bitwise_xor(img, vertical_horizontal_lines)
bitnot = cv2.bitwise_not(bitxor)
import pytesseract
# --- Find candidate cell boxes and preview them -----------------------------
# findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
# (image, contours, hierarchy) in 3.x; the last two items are the same
# everywhere, so slice instead of relying on the tuple length.
contours, hierarchy = cv2.findContours(
    vertical_horizontal_lines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]
boundingBoxes = [cv2.boundingRect(contour) for contour in contours]

# zip(*sorted(...)) cannot be unpacked into two names when nothing was found;
# report that condition explicitly.
if not boundingBoxes:
    raise ValueError("no contours found in the table-line mask")

# Order contours (and their boxes) top-to-bottom by the box's y coordinate.
(contours, boundingBoxes) = zip(
    *sorted(zip(contours, boundingBoxes), key=lambda x: x[1][1]))

boxes = []
# cv2.rectangle draws on `img`; binding `image` up front keeps the preview
# below working even when no box passes the size filter.
image = img
for x, y, w, h in boundingBoxes:
    # Keep only boxes under 1000 px wide and under 500 px tall (presumably to
    # drop page- or table-sized contours; verify against the input images).
    if w < 1000 and h < 500:
        image = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        boxes.append([x, y, w, h])

plotting = plt.imshow(image, cmap='gray')
plt.title("Identified contours")
plt.show()
# --- Group the boxes into table rows ----------------------------------------
# `rows` becomes a list of rows; each row is a list of [x, y, w, h] boxes.
rows = []
columns = []  # boxes collected for the row currently being built

# Mean box height; half of it is the vertical tolerance for "same row".
# NOTE(review): this averages over every bounding box, including the ones the
# size filter excluded from `boxes`; confirm that this is intended.
heights = [bounding_box[3] for bounding_box in boundingBoxes]
mean = np.mean(heights)
print(mean)

if boxes:
    columns.append(boxes[0])
    previous = boxes[0]
    for current in boxes[1:]:
        # A box whose top lies more than mean/2 below the previous box's top
        # starts a new row: flush the row collected so far.
        if current[1] > previous[1] + mean / 2:
            rows.append(columns)
            columns = []
        columns.append(current)
        previous = current
    # Always flush the row in progress. The old code only did this when the
    # last box continued the current row, so a final row consisting of a
    # single box (or a lone box overall) was silently dropped.
    rows.append(columns)

print("Rows")
for row in rows:
    print(row)
# --- Column count and column centres -----------------------------------------
# The table's column count is the number of cells in its widest row.
# The previous loop iterated over the boxes of the last printed row and
# measured len() of each box, which is always 4 (x, y, w, h), so the count
# was only right for four-column tables.
widest_row = max(rows, key=len, default=[])
total_cells = len(widest_row)
print(total_cells)

# Horizontal centre (x + w/2) of every cell in the widest row; these act as
# the reference column positions. The previous expression indexed `rows` with
# a loop variable left over from the loop above, i.e. an arbitrary row.
center = [int(x + w / 2) for x, _y, w, _h in widest_row]
print(center)

# Sorted left-to-right so that a position in `center` is a column index.
center = np.array(center)
center.sort()
print(center)
# --- Assign every cell of every row to its nearest column -------------------
# boxes_list[r][c] is the list of [x, y, w, h] boxes of row r that fall into
# column c; a column with no box in that row stays an empty list.
boxes_list = []
for row_cells in rows:
    # One empty bucket per table column.
    buckets = [[] for _ in range(total_cells)]
    for cell in row_cells:
        # Distance from each column centre to the point x + w/4 of the cell.
        # NOTE(review): the reference point is a quarter of the width into
        # the cell, while the centres were computed with w/2; confirm.
        distances = list(abs(center - (cell[0] + cell[2] / 4)))
        nearest = distances.index(min(distances))
        buckets[nearest].append(cell)
    boxes_list.append(buckets)

for box in boxes_list:
    print(box)
# --- OCR every cell -----------------------------------------------------------
# dataframe_final is a flat, row-major list holding one string per
# (row, column) slot of boxes_list.
dataframe_final = []

# The same 2x1 structuring element is used for every cell, so build it once
# instead of once per box.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))

for row_buckets in boxes_list:
    for bucket in row_buckets:
        if not bucket:
            # No box landed in this column for this row: blank placeholder.
            dataframe_final.append(' ')
            continue

        texts = []
        for x, y, w, h in bucket:
            # NumPy images are indexed [row, column], i.e. [y, x].
            roi = bitnot[y:y + h, x:x + w]
            # Pad with a 2-px white border, upscale 2x, then dilate and erode
            # before handing the crop to Tesseract.
            border = cv2.copyMakeBorder(
                roi, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
            resizing = cv2.resize(
                border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
            dilation = cv2.dilate(resizing, kernel, iterations=1)
            erosion = cv2.erode(dilation, kernel, iterations=2)
            # The former "retry on empty output" repeated this exact call
            # with identical arguments, so it could not produce a different
            # result and only doubled OCR time. If a fallback is wanted, it
            # has to pass a different `config` (e.g. another --psm mode).
            texts.append(pytesseract.image_to_string(erosion))

        # Same layout as before: every fragment is preceded by one space.
        dataframe_final.append(" " + " ".join(texts))

print(dataframe_final)
arr = np.array(dataframe_final)
import pandas as pd
# --- Assemble, print and export the table ------------------------------------
# Reshape the flat, row-major OCR results into len(rows) x total_cells.
dataframe = pd.DataFrame(arr.reshape(len(rows), total_cells))
# Styled view of the table; not used again in this script.
data = dataframe.style.set_properties(align="left")

# Print the table cell by cell, one table row per output line.
# dataframe[i][j] selects COLUMN i and then ROW j, which transposed the
# output and raised KeyError for non-square tables; .iloc[i, j] addresses
# row i, column j by position.
for i in range(len(rows)):
    for j in range(total_cells):
        print(dataframe.iloc[i, j], end=" ")
    print()

print(dataframe)
# utf-8-sig writes a byte-order mark at the start of the file.
dataframe.to_csv("output1.csv", encoding="utf-8-sig")