
Commit e790f50

Benchmark updates
1 parent a7b7700 commit e790f50

6 files changed (+47 -35 lines)

README.md (+10 -12)
@@ -222,7 +222,7 @@ The `results.json` file will contain a json dictionary where the keys are the in

 **Performance tips**

-Setting the `DETECTOR_BATCH_SIZE` env var properly will make a big difference when using a GPU. Each batch item will use `400MB` of VRAM, so very high batch sizes are possible. The default is a batch size `36`, which will use about 16GB of VRAM. Depending on your CPU core count, it might help, too - the default CPU batch size is `6`.
+Setting the `LAYOUT_BATCH_SIZE` env var properly will make a big difference when using a GPU. Each batch item will use `220MB` of VRAM, so very high batch sizes are possible. The default is a batch size `32`, which will use about 7GB of VRAM. Depending on your CPU core count, it might help, too - the default CPU batch size is `4`.

 ### From python

@@ -278,7 +278,6 @@ The `results.json` file will contain a json dictionary where the keys are the in

 Setting the `TABLE_REC_BATCH_SIZE` env var properly will make a big difference when using a GPU. Each batch item will use `150MB` of VRAM, so very high batch sizes are possible. The default is a batch size `64`, which will use about 10GB of VRAM. Depending on your CPU core count, it might help, too - the default CPU batch size is `8`.

-
 # Limitations

 - This is specialized for document OCR. It will likely not work on photos or other images.
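
Both performance tips above work through environment variables. A minimal sketch of setting them from Python (an illustration, not from this commit). Pydantic-style settings read env vars when the settings object is created, so set them before importing surya:

```python
import os

# Example values, not the defaults: each layout batch item uses ~220MB of
# VRAM and each table-rec item ~150MB, so size these to your GPU.
os.environ["LAYOUT_BATCH_SIZE"] = "64"
os.environ["TABLE_REC_BATCH_SIZE"] = "128"
```

The same can be done in a shell: `LAYOUT_BATCH_SIZE=64 python your_script.py`.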
@@ -364,16 +363,15 @@ Then we calculate precision and recall for the whole dataset.

 ## Layout analysis

-![Benchmark chart](static/images/benchmark_layout_chart.png)
-
-| Layout Type | precision | recall |
-| ----------- | --------- | ------ |
-| Image | 0.97 | 0.96 |
-| Table | 0.99 | 0.99 |
-| Text | 0.9 | 0.97 |
-| Title | 0.94 | 0.88 |
+| Layout Type | precision | recall  |
+|-------------|-----------|---------|
+| Image       | 0.91265   | 0.93976 |
+| List        | 0.80849   | 0.86792 |
+| Table       | 0.84957   | 0.96104 |
+| Text        | 0.93019   | 0.94571 |
+| Title       | 0.92102   | 0.95404 |

-Time per image - .4 seconds on GPU (A10).
+Time per image - .13 seconds on GPU (A10).

 **Methodology**

@@ -384,7 +382,7 @@ I benchmarked the layout analysis on [Publaynet](https://github.com/ibm-aur-nlp/

 ## Reading Order

-75% mean accuracy, and .14 seconds per image on an A6000 GPU. See methodology for notes - this benchmark is not a perfect measure of accuracy, and is more useful as a sanity check.
+88% mean accuracy, and .4 seconds per image on an A10 GPU. See methodology for notes - this benchmark is not a perfect measure of accuracy, and is more useful as a sanity check.

 **Methodology**

benchmark/layout.py (+4 -3)

@@ -44,10 +44,11 @@ def main():
     os.makedirs(result_path, exist_ok=True)

     label_alignment = { # First is publaynet, second is surya
-        "Image": [["Figure"], ["Picture", "Figure", "Handwriting"]],
+        "Image": [["Figure"], ["Picture", "Figure"]],
         "Table": [["Table"], ["Table", "Form", "TableOfContents"]],
-        "Text": [["Text", "List"], ["Text", "Formula", "Footnote", "Caption", "ListItem", "TextInlineMath", "Code", "PageFooter"]],
-        "Title": [["Title"], ["SectionHeader", "Title", "PageHeader"]]
+        "Text": [["Text"], ["Text", "Formula", "Footnote", "Caption", "TextInlineMath", "Code", "Handwriting"]],
+        "List": [["List"], ["ListItem"]],
+        "Title": [["Title"], ["SectionHeader", "Title"]]
     }

     page_metrics = collections.OrderedDict()
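
For context, a hypothetical sketch (not part of this commit) of how an alignment table like `label_alignment` can be consumed: each side's dataset-specific labels are collapsed into the shared categories before scoring, so publaynet's `Figure` and surya's `Picture`/`Figure` both count as `Image`.

```python
def align_labels(labeled_boxes, alignment, source_idx):
    """Group bboxes under shared categories.

    labeled_boxes: list of (label, bbox) pairs.
    source_idx: 0 for publaynet labels, 1 for surya labels.
    """
    aligned = {}
    for category, label_sets in alignment.items():
        wanted = set(label_sets[source_idx])
        aligned[category] = [bbox for label, bbox in labeled_boxes if label in wanted]
    return aligned
```

The helper name and signature are illustrative; the benchmark's real loop may structure this differently.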

pyproject.toml (+1 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.6.13"
+version = "0.7.0"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"

surya/benchmark/metrics.py (+29 -17)

@@ -2,20 +2,9 @@
 from itertools import repeat

 import numpy as np
-from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


-def intersection_area(box1, box2):
-    x_left = max(box1[0], box2[0])
-    y_top = max(box1[1], box2[1])
-    x_right = min(box1[2], box2[2])
-    y_bottom = min(box1[3], box2[3])
-
-    if x_right < x_left or y_bottom < y_top:
-        return 0.0
-
-    return (x_right - x_left) * (y_bottom - y_top)
-
 def box_area(box):
     return (box[2] - box[0]) * (box[3] - box[1])

@@ -109,16 +98,39 @@ def calculate_coverage(box, other_boxes, penalize_double=False):
     return covered_pixels_count / box_area


+def intersection_area(box1, box2):
+    x_left = max(box1[0], box2[0])
+    y_top = max(box1[1], box2[1])
+    x_right = min(box1[2], box2[2])
+    y_bottom = min(box1[3], box2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    return (x_right - x_left) * (y_bottom - y_top)
+
+
 def calculate_coverage_fast(box, other_boxes, penalize_double=False):
+    box = np.array(box)
+    other_boxes = np.array(other_boxes)
+
+    # Calculate box area
     box_area = (box[2] - box[0]) * (box[3] - box[1])
     if box_area == 0:
         return 0

-    total_intersect = 0
-    for other_box in other_boxes:
-        total_intersect += intersection_area(box, other_box)
+    x_left = np.maximum(box[0], other_boxes[:, 0])
+    y_top = np.maximum(box[1], other_boxes[:, 1])
+    x_right = np.minimum(box[2], other_boxes[:, 2])
+    y_bottom = np.minimum(box[3], other_boxes[:, 3])
+
+    widths = np.maximum(0, x_right - x_left)
+    heights = np.maximum(0, y_bottom - y_top)
+    intersect_areas = widths * heights
+
+    total_intersect = np.sum(intersect_areas)

-    return min(1, total_intersect / box_area)
+    return min(1.0, total_intersect / box_area)


 def precision_recall(preds, references, threshold=.5, workers=8, penalize_double=True):

@@ -139,7 +151,7 @@ def precision_recall(preds, references, threshold=.5, workers=8, penalize_double
     if penalize_double:
         coverage_func = calculate_coverage

-    with ProcessPoolExecutor(max_workers=workers) as executor:
+    with ThreadPoolExecutor(max_workers=workers) as executor:
         precision_func = partial(coverage_func, penalize_double=penalize_double)
         precision_iou = executor.map(precision_func, preds, repeat(references))
         reference_iou = executor.map(coverage_func, references, repeat(preds))
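
A quick usage sketch of the vectorized `calculate_coverage_fast` above, with made-up boxes: a 10×10 box whose left half and top-right quarter are covered by two predictions.

```python
box = [0, 0, 10, 10]                     # x1, y1, x2, y2
others = [[0, 0, 5, 10], [5, 0, 10, 5]]  # left half (50 px) + top-right quarter (25 px)

print(calculate_coverage_fast(box, others))  # 75 / 100 -> 0.75
```

A plausible reading of the executor change, though the commit message does not say: once the per-box work is a few numpy calls, `ThreadPoolExecutor` avoids the pickling overhead that `ProcessPoolExecutor` pays to ship the box lists between processes.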

surya/layout.py (+1 -1)

@@ -157,7 +157,7 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
             prediction["pause_tokens"] = last_prediction["pause_tokens"]
             prediction["token"].fill_(model.decoder.config.pause_token_id)
             batch_decoder_input[j, :] = model.decoder.config.pause_token_id
-        elif intersects_other_boxes(prediction["polygon"], [p["polygon"] for p in batch_predictions[j]], thresh=.7):
+        elif intersects_other_boxes(prediction["polygon"], [p["polygon"] for p in batch_predictions[j]], thresh=.4):
            prediction["paused"] = True
            prediction["pause_tokens"] = 1
            prediction["token"].fill_(model.decoder.config.pause_token_id)

surya/settings.py (+2 -1)

@@ -69,8 +69,9 @@ def TORCH_DEVICE_MODEL(self) -> str:
     LAYOUT_IMAGE_SIZE: Dict = {"height": 768, "width": 768}
     LAYOUT_BATCH_SIZE: Optional[int] = None
     LAYOUT_BENCH_DATASET_NAME: str = "vikp/publaynet_bench"
-    LAYOUT_MAX_BOXES: int = 150
+    LAYOUT_MAX_BOXES: int = 100
     COMPILE_LAYOUT: bool = False
+    ORDER_BENCH_DATASET_NAME: str = "vikp/order_bench"

     # Table Rec
     TABLE_REC_MODEL_CHECKPOINT: str = "vikp/surya_tablerec"
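
A small sketch of reading these values back, assuming surya exposes a module-level `settings` object whose fields can be overridden by environment variables of the same name (pydantic-style):

```python
from surya.settings import settings  # assumed import path

print(settings.LAYOUT_MAX_BOXES)          # 100 unless LAYOUT_MAX_BOXES is set
print(settings.ORDER_BENCH_DATASET_NAME)  # "vikp/order_bench"
```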
