-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathmain.py
107 lines (84 loc) · 2.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import glob
import mimetypes
import os
from os import environ as env
from typing import List
import requests
from dotenv import load_dotenv
from tqdm import tqdm
from classes import Table, Text, read_tables_from_list, read_texts_from_list
from utility import dump_excel, load_json, merge_text_table, timeit
# Pull variables from a .env file (if any) into the process environment
# before the lookups below.
load_dotenv()

# Service settings are read eagerly: a missing variable raises KeyError as
# soon as this module is imported. Values stay strings, exactly as stored in
# the environment. Grouped by service family (table / text).
TABLE_DETECTION_PORT, TABLE_RECOGNITION_PORT = (
    env["TABLE_DETECTION_PORT"],
    env["TABLE_RECOGNITION_PORT"],
)
TEXT_DETECTION_PORT, TEXT_RECOGNITION_PORT = (
    env["TEXT_DETECTION_PORT"],
    env["TEXT_RECOGNITION_PORT"],
)
@timeit
def get_table(image_path):
    """Upload an image to the table-recognition service and return its reply.

    The image is sent as the multipart form field ``file`` in a POST request
    to ``/ai/infer`` on ``localhost:TABLE_RECOGNITION_PORT``.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The response body decoded from JSON.

    Raises:
        OSError: If ``image_path`` cannot be opened for reading.
    """
    image_name = os.path.basename(image_path)
    url = f"http://localhost:{TABLE_RECOGNITION_PORT}/ai/infer"
    # guess_type returns (type, encoding); only the MIME type is forwarded.
    content_type = mimetypes.guess_type(image_path)[0]
    # Hold the file in a context manager so the handle is closed as soon as
    # the upload finishes, even when the request raises.
    with open(image_path, "rb") as image_file:
        files = [("file", (image_name, image_file, content_type))]
        response = requests.request("POST", url, files=files)
    return response.json()
@timeit
def get_ocr(image_path):
    """Upload an image to the text-recognition service and return its reply.

    The image is sent as the multipart form field ``file`` in a POST request
    to ``/ai/infer`` on ``localhost:TEXT_RECOGNITION_PORT``.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The response body decoded from JSON.

    Raises:
        OSError: If ``image_path`` cannot be opened for reading.
    """
    image_name = os.path.basename(image_path)
    url = f"http://localhost:{TEXT_RECOGNITION_PORT}/ai/infer"
    # guess_type returns (type, encoding); only the MIME type is forwarded.
    content_type = mimetypes.guess_type(image_path)[0]
    # Hold the file in a context manager so the handle is closed as soon as
    # the upload finishes, even when the request raises.
    with open(image_path, "rb") as image_file:
        files = [("file", (image_name, image_file, content_type))]
        response = requests.request("POST", url, files=files)
    return response.json()
@timeit
def main():
    """Rebuild tables for the selected image from cached JSON and export them.

    Walks every ``*.jpg`` under the project's ``data/images`` directory but
    only processes paths containing ``eu-0050002``. For each processed image
    the table and OCR results are loaded from JSON files under ``cache/table``
    and ``cache/ocr`` (named after the image's file stem) instead of being
    requested through ``get_table`` / ``get_ocr``. The texts are merged into
    the tables, each table is indexed, and the tables are passed to
    ``dump_excel`` with the target ``debug.xlsx``.
    """
    image_paths = glob.glob(
        "/home/luan/research/Go5-Project/data/images/*.jpg"
    )
    for image_path in tqdm(image_paths):
        # Debug filter: skip everything except the sample under inspection.
        if "eu-0050002" not in image_path:
            continue

        # File stem (base name without extension) keys both cache files.
        file_name, _extension = os.path.splitext(os.path.basename(image_path))

        # Cached table data -> Table objects.
        table_records: List = load_json(
            f"/home/luan/research/Go5-Project/cache/table/{file_name}.json"
        )
        tables: List[Table] = read_tables_from_list(table_records)

        # Cached OCR data -> Text objects.
        text_records: List = load_json(
            f"/home/luan/research/Go5-Project/cache/ocr/{file_name}.json"
        )
        texts: List[Text] = read_texts_from_list(text_records)

        merge_text_table(tables, texts)

        for table in tables:
            table.indexing()

        # NOTE(review): the original indentation was lost; this call is
        # assumed to run once per processed image - confirm.
        dump_excel(tables, "debug.xlsx")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()