-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpic_to_df_E.py
104 lines (87 loc) · 3.64 KB
/
pic_to_df_E.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
__author__ = """Nripesh Niketan"""
__email__ = """nripesh.niketan@emerson.com"""
import time
import cv2
import pytesseract
import pandas as pd
import re
import os
import concurrent.futures
import shutil
import warnings
import numpy as np
# Silence every warning for the whole process.
# NOTE(review): this blanket filter also hides library deprecation warnings
# (e.g. from pandas) - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Set the path to the tesseract executable.
# NOTE(review): this is a relative, Windows-style path, so it presumably only
# resolves when the script is started from the directory that contains the
# "tesseract" folder - confirm against the deployment layout.
pytesseract.pytesseract.tesseract_cmd = r'tesseract\tesseract.exe'
# Module-level accumulator of OCR rows that could not be parsed (one column,
# 'skipped'). read_image uses it as the default for its `skipped` parameter and
# read_images_in_folder rebinds it with the rows collected from every image.
# (A `global` statement is only meaningful inside a function, so none is
# needed here.)
skipped = pd.DataFrame(columns=['skipped'])
def _binarize_for_ocr(im):
    """Return a shadow-free, black-and-white copy of the BGR image ``im``."""
    im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    # Remove shadows: estimate the background by dilating (which replaces
    # each pixel with the brightest value in its 5x5 neighbourhood) and
    # median-blurring, then subtract that estimate from the grayscale image.
    dilated_img = cv2.dilate(im_gray, np.ones((5, 5), np.uint8))
    bg_img = cv2.medianBlur(dilated_img, 21)
    im_no_shadow = 255 - cv2.absdiff(im_gray, bg_img)
    # With THRESH_OTSU the threshold is computed from the image, so the
    # explicit threshold value (0) is not used.
    return cv2.threshold(
        im_no_shadow, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )[1]


def read_image(file, skipped=skipped):
    """OCR one image of a four-column table and return its rows as a DataFrame.

    The first line of the recognised text is treated as the header and is
    only printed. Every following line is split on whitespace; lines with
    exactly four fields are read as ``locator, line, item_code, qty``, and
    all other lines are recorded as skipped.

    Side effects: prints the header fields and writes the preprocessed image
    to ``preprocessed.png`` in the current working directory.

    Args:
        file: Path of the image to read.
        skipped: DataFrame with a ``'skipped'`` column to which the
            unparsable lines are added. The argument itself is not modified.

    Returns:
        A ``(df, skipped)`` tuple. ``df`` has the columns ``Locator``,
        ``item_code``, ``qty``, ``line`` and ``file``; ``skipped`` is the
        input frame followed by one row per unparsable line.

    Raises:
        ValueError: If OpenCV cannot read ``file`` as an image.
    """
    im = cv2.imread(file)
    if im is None:
        # cv2.imread reports failure by returning None rather than raising;
        # without this check the error surfaces later as an opaque cvtColor
        # assertion that does not name the offending file.
        raise ValueError(f"Could not read image file: {file}")

    # Preprocess the image
    im_thresh = _binarize_for_ocr(im)
    # Output the preprocessed image
    # NOTE(review): every call overwrites the same file, so when several
    # images are processed in parallel only one result is kept - confirm
    # this is just a debugging aid.
    cv2.imwrite('preprocessed.png', im_thresh)

    # Extract text with pytesseract
    # (--oem 3: default OCR engine, --psm 6: treat the page as one uniform
    # block of text)
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(im_thresh, config=custom_config)

    # Process the text and create a DataFrame
    rows = text.strip().split('\n')
    header, data = rows[0].split(), rows[1:]
    print(header)

    locators, lines, item_codes, qtys = [], [], [], []
    rejected_rows = []
    for row in data:
        values = re.split(r'\s+', row)  # Split on one or more whitespace characters
        if len(values) == 4:
            locator, line, item_code, qty = values
            locators.append(locator)
            lines.append(line)
            item_codes.append(item_code)
            qtys.append(qty)
        else:
            # Rows with any other number of fields are not guessed at; they
            # are reported back to the caller verbatim.
            rejected_rows.append(row)

    if rejected_rows:
        # DataFrame.append was removed in pandas 2.0; a single concat also
        # avoids copying the whole frame once per skipped row.
        skipped = pd.concat(
            [skipped, pd.DataFrame({'skipped': rejected_rows})],
            ignore_index=True,
        )

    df = pd.DataFrame({
        'Locator': locators,
        'item_code': item_codes,
        'qty': qtys,
        'line': lines,
        'file': file,
    })
    return df, skipped
def read_images_in_folder(folder):
    """OCR every ``.png``/``.jpg`` image in ``folder`` and combine the results.

    Images are handed to ``read_image`` through a process pool. As soon as
    the result for an image has been collected, the image is moved into the
    ``processed_files`` directory (created in the current working directory
    if missing).

    Side effects: rebinds the module-level ``skipped`` DataFrame to its
    previous content followed by the skipped rows of every image collected,
    even if a later image raises.

    Args:
        folder: Directory to scan. Only direct children whose names end in
            ``.png`` or ``.jpg`` (case-sensitive) are processed.

    Returns:
        One DataFrame with the rows of all images and a fresh 0..n-1 index,
        or an empty DataFrame when the folder contains no matching image.
    """
    global skipped
    files = [
        os.path.join(folder, file)
        for file in os.listdir(folder)
        if file.endswith((".png", ".jpg"))
    ]
    processed_files_folder = 'processed_files'
    os.makedirs(processed_files_folder, exist_ok=True)

    # Collect the per-image frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop copied the accumulated frame for every image.
    frames = []
    skipped_frames = [skipped]
    try:
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # map() yields results in the same order as `files`, which is
            # what makes the zip below pair each file with its own result.
            results = executor.map(read_image, files)
            for file, (dataframe, new_skipped) in zip(files, results):
                frames.append(dataframe)
                skipped_frames.append(new_skipped)
                destination = os.path.join(
                    processed_files_folder, os.path.basename(file)
                )
                shutil.move(file, destination)
    finally:
        # Keep the skipped rows gathered so far even when a worker raised.
        skipped = pd.concat(skipped_frames, ignore_index=True)

    if not frames:
        # pd.concat rejects an empty list; mirror the empty-folder result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
if __name__ == '__main__':
    # Script entry point: OCR every image in the input folder, then save the
    # parsed rows and the rows that could not be parsed as two sheets of a
    # single Excel workbook.
    parsed = read_images_in_folder(r'new_images')
    # `skipped` is read only after the call above, which rebinds it.
    sheets = (('data', parsed), ('skipped', skipped))
    with pd.ExcelWriter('output.xlsx') as workbook:
        for sheet_name, frame in sheets:
            frame.to_excel(workbook, sheet_name=sheet_name, index=False)