Skip to content

Commit 76a7f4a

Browse files
sunveil, NastyBoget, oksidgy
authored
TLDR-851 TLDR-861 Refactor table recognition (#508)
Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
Co-authored-by: Belyaeva Oksana <belyaeva@ispras.ru>
1 parent e4ec06b commit 76a7f4a

28 files changed

+367
-840
lines changed

dedoc/api/api_args.py

-3
Original file line number | Diff line number | Diff line change
@@ -22,9 +22,6 @@ class QueryParameters:
2222
# tables handling
2323
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
2424
table_type: str = Form("", description="Pipeline mode for table recognition")
25-
orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
26-
orient_cell_angle: str = Form("90", enum=["90", "270"],
27-
description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
2825

2926
# pdf handling
3027
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],

dedoc/api/web/index.html

+10-23
Original file line number | Diff line number | Diff line change
@@ -98,31 +98,9 @@ <h4>Attachments handling</h4>
9898
</details>
9999
</div>
100100

101-
102-
<div class="parameters">
103-
<h4>Tables handling </h4>
104-
<details><summary>need_pdf_table_analysis, orient_analysis_cells, orient_cell_angle</summary>
105-
<br>
106-
<p>
107-
<label>
108-
<input type="hidden" name="need_pdf_table_analysis" value="false">
109-
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
110-
</p>
111-
112-
<p>
113-
<label><input name="orient_analysis_cells" type="checkbox" value="true"> orient_analysis_cells</label>
114-
</p>
115-
116-
<p>
117-
<label>orient_cell_angle <input name="orient_cell_angle" type="number" size="5" value="90"></label>
118-
</p>
119-
</details>
120-
</div>
121-
122-
123101
<div class="parameters">
124102
<h4>PDF handling</h4>
125-
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
103+
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
126104
<br>
127105
<p>
128106
<label>
@@ -153,6 +131,15 @@ <h4>PDF handling</h4>
153131
</label>
154132
</p>
155133

134+
<details><summary>need_pdf_table_analysis</summary>
135+
<br>
136+
<p>
137+
<label>
138+
<input type="hidden" name="need_pdf_table_analysis" value="false">
139+
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
140+
</p>
141+
</details>
142+
156143
<p>
157144
<label>pages <input name="pages" type="text" size="8" value=":"></label>
158145
</p>

dedoc/data_structures/cell_with_meta.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -47,9 +47,8 @@ def get_annotations(self) -> List[Annotation]:
4747
"""
4848
return LineWithMeta.join(lines=self.lines, delimiter="\n").annotations
4949

50-
@staticmethod
51-
def create_from_cell(cell: "CellWithMeta") -> "CellWithMeta":
52-
return CellWithMeta(lines=cell.lines, colspan=cell.colspan, rowspan=cell.rowspan, invisible=cell.invisible)
50+
def __str__(self) -> str:
51+
return f"CellWithMeta(cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
5352

5453
def to_api_schema(self) -> ApiCellWithMeta:
5554
import numpy as np
Original file line number | Diff line number | Diff line change
@@ -1,78 +1,46 @@
1+
import copy
12
from typing import List, Optional
23

34
from dedocutils.data_structures import BBox
45

5-
from dedoc.data_structures.annotation import Annotation
66
from dedoc.data_structures.cell_with_meta import CellWithMeta
77
from dedoc.data_structures.line_with_meta import LineWithMeta
88

99

1010
class Cell(CellWithMeta):
1111

1212
@staticmethod
13-
def copy_from(cell: "Cell",
14-
x_top_left: Optional[int] = None,
15-
x_bottom_right: Optional[int] = None,
16-
y_top_left: Optional[int] = None,
17-
y_bottom_right: Optional[int] = None) -> "Cell":
18-
x_top_left = cell.x_top_left if x_top_left is None else x_top_left
19-
x_bottom_right = cell.x_bottom_right if x_bottom_right is None else x_bottom_right
20-
y_top_left = cell.y_top_left if y_top_left is None else y_top_left
21-
y_bottom_right = cell.y_bottom_right if y_bottom_right is None else y_bottom_right
22-
return Cell(x_top_left=x_top_left,
23-
x_bottom_right=x_bottom_right,
24-
y_top_left=y_top_left,
25-
y_bottom_right=y_bottom_right,
26-
id_con=cell.id_con,
27-
lines=cell.lines,
28-
is_attribute=cell.is_attribute,
29-
is_attribute_required=cell.is_attribute_required,
30-
rotated_angle=cell.rotated_angle,
31-
uid=cell.cell_uid,
32-
contour_coord=cell.con_coord)
13+
def copy_from(cell: "Cell", bbox: Optional[BBox] = None) -> "Cell":
14+
copy_cell = copy.deepcopy(cell)
15+
if bbox:
16+
copy_cell.bbox = bbox
17+
18+
return copy_cell
3319

3420
def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
3521
if self.lines:
3622
for line in self.lines:
3723
line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
38-
self.x_top_left += shift_x
39-
self.x_bottom_right += shift_x
40-
self.y_top_left += shift_y
41-
self.y_bottom_right += shift_y
42-
if self.con_coord:
43-
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
4424

45-
def __init__(self, x_top_left: int, x_bottom_right: int, y_top_left: int, y_bottom_right: int, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
46-
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: str = None,
47-
contour_coord: Optional[BBox] = None) -> None:
25+
self.bbox.shift(shift_x=shift_x, shift_y=shift_y)
26+
if self.contour_coord:
27+
self.contour_coord.shift(shift_x=shift_x, shift_y=shift_y)
4828

49-
import uuid
29+
def __init__(self, bbox: BBox, id_con: int = -1, lines: Optional[List[LineWithMeta]] = None,
30+
is_attribute: bool = False, is_attribute_required: bool = False, rotated_angle: int = 0, uid: Optional[str] = None,
31+
contour_coord: Optional[BBox] = None, colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None:
5032

51-
assert x_top_left <= x_bottom_right
52-
assert y_top_left <= y_bottom_right
33+
import uuid
5334

54-
self.lines = [] if lines is None else lines
55-
super().__init__(lines)
35+
super().__init__(lines=lines, colspan=colspan, rowspan=rowspan, invisible=invisible)
5636

57-
self.x_top_left = x_top_left
58-
self.x_bottom_right = x_bottom_right
59-
self.y_top_left = y_top_left
60-
self.y_bottom_right = y_bottom_right
37+
self.bbox = bbox
6138
self.id_con = id_con
6239
self.is_attribute = is_attribute
6340
self.is_attribute_required = is_attribute_required
6441
self.rotated_angle = rotated_angle
65-
self.cell_uid = f"cell_{uuid.uuid1()}" if uid is None else uid
66-
self.con_coord = contour_coord or BBox(0, 0, 0, 0)
67-
68-
def __str__(self) -> str:
69-
return f"Cell((cs={self.colspan}, rs={self.rowspan}, {self.get_text()})"
70-
71-
def get_text(self) -> str:
72-
return "\n".join([line.line for line in self.lines])
73-
74-
def get_annotations(self) -> List[Annotation]:
75-
return LineWithMeta.join(self.lines, delimiter="\n").annotations
42+
self.uuid = uuid.uuid4() if uid is None else uid
43+
self.contour_coord = contour_coord or BBox(0, 0, 0, 0)
7644

7745
def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_height: int) -> None:
7846
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
@@ -96,11 +64,3 @@ def change_lines_boxes_page_width_height(self, new_page_width: int, new_page_hei
9664

9765
def __repr__(self) -> str:
9866
return self.__str__()
99-
100-
@property
101-
def width(self) -> int:
102-
return self.x_bottom_right - self.x_top_left
103-
104-
@property
105-
def height(self) -> int:
106-
return self.y_bottom_right - self.y_top_left

dedoc/readers/pdf_reader/data_classes/tables/scantable.py

+14-75
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, List, Optional
1+
from typing import List
22

33
from dedocutils.data_structures import BBox
44

@@ -9,106 +9,45 @@
99
from dedoc.readers.pdf_reader.data_classes.tables.location import Location
1010

1111

12-
class ScanTable:
13-
def __init__(self, page_number: int, matrix_cells: Optional[List[List[CellWithMeta]]] = None, bbox: Optional[BBox] = None,
14-
name: str = "", order: int = -1) -> None:
15-
self.matrix_cells = matrix_cells
16-
self.page_number = page_number
17-
self.locations = []
18-
self.name = name
12+
class ScanTable(Table):
13+
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
14+
15+
super().__init__(cells, TableMetadata(page_id=page_number))
1916
self.order = order
20-
if bbox is not None:
21-
self.locations.append(Location(page_number, bbox))
17+
self.locations = [Location(page_number, bbox)]
2218

2319
def extended(self, table: "ScanTable") -> None:
2420
# extend locations
2521
self.locations.extend(table.locations)
2622
# extend values
27-
self.matrix_cells.extend(table.matrix_cells)
23+
self.cells.extend(table.cells)
2824
# extend order
2925
self.order = max(self.order, table.order)
3026

3127
def check_on_cell_instance(self) -> bool:
32-
if len(self.matrix_cells) == 0:
28+
if len(self.cells) == 0:
3329
return False
34-
if len(self.matrix_cells[0]) == 0:
30+
if len(self.cells[0]) == 0:
3531
return False
36-
if not isinstance(self.matrix_cells[0][0], Cell):
32+
if not isinstance(self.cells[0][0], Cell):
3733
return False
3834
return True
3935

40-
def to_table(self) -> Table:
41-
metadata = TableMetadata(page_id=self.page_number, uid=self.name, rotated_angle=self.location.rotated_angle)
42-
cells_with_meta = [[CellWithMeta.create_from_cell(cell) for cell in row] for row in self.matrix_cells]
43-
return Table(metadata=metadata, cells=cells_with_meta)
44-
45-
@staticmethod
46-
def get_cells_text(attr_cells: List[List[Cell]]) -> List[List[str]]:
47-
attrs = []
48-
for i in range(0, len(attr_cells)):
49-
attrs.append([a.get_text() for a in attr_cells[i]])
50-
51-
return attrs
52-
53-
@staticmethod
54-
def get_key_value_attrs(attrs: List, val: Any) -> dict: # noqa
55-
res_attrs = []
56-
for i in range(0, len(attrs)):
57-
res_attrs.append({"attr": attrs[i]})
58-
res = {
59-
"attrs": res_attrs,
60-
"val": val
61-
}
62-
return res
63-
64-
@staticmethod
65-
def get_index_of_end_string_attr(matrix_cells: List[List[Cell]]) -> int:
66-
end_attr_string = 0
67-
for i in range(0, len(matrix_cells)):
68-
if matrix_cells[i][0].is_attribute:
69-
end_attr_string = i
70-
71-
return end_attr_string
72-
73-
@staticmethod
74-
def get_attributes_cell(matrix_cells: List[List[Cell]]) -> (List[int], List[List[Cell]], int):
75-
import copy
76-
import numpy as np
77-
78-
required_columns = []
79-
for j in range(0, len(matrix_cells[0])):
80-
if matrix_cells[0][j].is_attribute_required:
81-
required_columns.append(j)
82-
83-
end_attr_string = ScanTable.get_index_of_end_string_attr(matrix_cells)
84-
85-
attrs = copy.deepcopy(np.array(matrix_cells[0:end_attr_string + 1]))
86-
attrs = attrs.transpose().tolist()
87-
88-
return [required_columns, attrs, end_attr_string]
89-
90-
@staticmethod
91-
def get_matrix_attrs_and_data(matrix_cells: List[List[Cell]]) -> (List[List[Cell]], List[List[str]], List[List[str]]):
92-
required_columns, attrs, end_attr_string = ScanTable.get_attributes_cell(matrix_cells)
93-
attrs_text = ScanTable.get_cells_text(attrs)
94-
95-
data = matrix_cells[(end_attr_string + 1):]
96-
data_text = ScanTable.get_cells_text(data)
97-
98-
return [attrs, attrs_text, data_text]
36+
def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]:
37+
return [[cell.get_text() for cell in row] for row in cells]
9938

10039
@property
10140
def location(self) -> Location:
10241
return min(self.locations)
10342

10443
@property
10544
def uid(self) -> str:
106-
return self.name
45+
return self.metadata.uid
10746

10847
def to_dict(self) -> dict:
10948
from collections import OrderedDict
11049

111-
data_text = ScanTable.get_cells_text(self.matrix_cells)
50+
data_text = self.__get_cells_text(self.cells)
11251

11352
res = OrderedDict()
11453
res["locations"] = [location.to_dict() for location in self.locations]

dedoc/readers/pdf_reader/pdf_base_reader.py

+2-20
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515

1616

1717
ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
18-
"orient_analysis_cells",
19-
"orient_cell_angle",
2018
"is_one_column_document",
2119
"document_orientation",
2220
"language",
@@ -73,8 +71,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
7371

7472
params_for_parse = ParametersForParseDoc(
7573
language=param_utils.get_param_language(parameters),
76-
orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
77-
orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters),
7874
is_one_column_document=param_utils.get_param_is_one_column_document(parameters),
7975
document_orientation=param_utils.get_param_document_orientation(parameters),
8076
need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters),
@@ -91,12 +87,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
9187
)
9288

9389
lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
94-
tables = [scan_table.to_table() for scan_table in scan_tables]
9590

9691
if params_for_parse.with_attachments and self.attachment_extractor.can_extract(file_path):
9792
attachments += self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
9893

99-
result = UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=warnings, metadata=metadata)
94+
result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata)
10095
return self._postprocess(result)
10196

10297
def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
@@ -177,7 +172,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[Scan
177172
table_page_number = location.page_number
178173
location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
179174
page_number = scan_table.locations[0].page_number
180-
for row in scan_table.matrix_cells:
175+
for row in scan_table.cells:
181176
for cell in row:
182177
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
183178
shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
@@ -275,16 +270,3 @@ def _binarization(self, gray_image: ndarray) -> ndarray:
275270
binary_mask = gray_image >= np.quantile(gray_image, 0.05)
276271
gray_image[binary_mask] = 255
277272
return gray_image
278-
279-
def eval_tables_by_batch(self,
280-
batch: Iterator[ndarray],
281-
page_number_begin: int,
282-
language: str,
283-
orient_analysis_cells: bool = False,
284-
orient_cell_angle: int = 270,
285-
table_type: str = "") -> Tuple[List[ndarray], List[ScanTable]]:
286-
from joblib import Parallel, delayed
287-
288-
result_batch = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.table_recognizer.recognize_tables_from_image)(
289-
image, page_number_begin + i, language, orient_analysis_cells, orient_cell_angle, table_type) for i, image in enumerate(batch))
290-
return result_batch

dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py

-2
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,6 @@ def _process_one_page(self,
8585
image=rotated_image,
8686
page_number=page_number,
8787
language=parameters.language,
88-
orient_analysis_cells=parameters.orient_analysis_cells,
89-
orient_cell_angle=parameters.orient_cell_angle,
9088
table_type=parameters.table_type
9189
)
9290
else:

0 commit comments

Comments
 (0)