Merge pull request #8 from VikParuchuri/dev

VikParuchuri · web-flow · commit c2067a21b857 · 2024-01-16T11:59:58.000-08:00
Allow for non-axis-aligned bboxes
diff --git a/README.md b/README.md
@@ -27,6 +27,7 @@ Surya is named after the [Hindu sun god](https://en.wikipedia.org/wiki/Surya), w
 | Presentation     | [Image](static/images/pres.png)     |
 | Scientific Paper | [Image](static/images/paper.png)    |
 | Scanned Document | [Image](static/images/scanned.png)  |
+| Scanned Form | [Image](static/images/funsd.png)    |
 
 # Installation
 
@@ -58,7 +59,13 @@ surya_detect DATA_PATH --images
 - `--max` specifies the maximum number of pages to process if you don't want to process everything
 - `--results_dir` specifies the directory to save results to instead of the default
 
-This has worked with every language I've tried.  It will work best with documents, and may not work well with photos or other images.  It will also not work well with handwriting.
+The `results.json` file will contain these keys for each page of the input document(s):
+
+- `polygons` - polygons for each detected text line (these are more accurate than the bboxes) in (x1, y1), (x2, y2), (x3, y3), (x4, y4) format.  The points are in clockwise order from the top left.
+- `bboxes` - axis-aligned rectangles for each detected text line in (x1, y1, x2, y2) format.  (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner.
+- `vertical_lines` - vertical lines detected in the document in (x1, y1, x2, y2) format.
+- `horizontal_lines` - horizontal lines detected in the document in (x1, y1, x2, y2) format.
+- `page_number` - the number of this page within the input document.
 
 **Performance tips**
 
@@ -102,8 +109,10 @@ If you want to develop surya, you can install it manually:
 
 # Limitations
 
-- This is specialized for document OCR.  It will likely not work on photos or other images.  It will also not work on handwritten text.
-- Does not work well with images that look like ads or other parts of documents that are usually ignored.
+- This is specialized for document OCR.  It will likely not work on photos or other images.
+- It is for printed text, not handwriting.
+- The model has learned to ignore advertisements, so text inside ads or ad-like regions may not be detected.
+- This has worked for every language I've tried, but languages with very different character sets may not work well.
 
 # Benchmarks
 
diff --git a/benchmark/detection.py b/benchmark/detection.py
@@ -9,7 +9,7 @@
 from surya.model.segformer import load_model, load_processor
 from surya.model.processing import open_pdf, get_page_images
 from surya.detection import batch_inference
-from surya.postprocessing.heatmap import draw_bboxes_on_image
+from surya.postprocessing.heatmap import draw_bboxes_on_image, draw_polys_on_image
 from surya.postprocessing.util import rescale_bbox
 from surya.settings import settings
 import os
@@ -68,6 +68,7 @@ def main():
     page_metrics = collections.OrderedDict()
     for idx, (tb, sb, cb) in enumerate(zip(tess_predictions, predictions, correct_boxes)):
         surya_boxes = sb["bboxes"]
+        surya_polys = sb["polygons"]
 
         surya_metrics = precision_recall(surya_boxes, cb)
         tess_metrics = precision_recall(tb, cb)
@@ -78,7 +79,7 @@ def main():
         }
 
         if args.debug:
-            bbox_image = draw_bboxes_on_image(surya_boxes, copy.deepcopy(images[idx]))
+            bbox_image = draw_polys_on_image(surya_polys, copy.deepcopy(images[idx]))
             bbox_image.save(os.path.join(result_path, f"{idx}_bbox.png"))
 
     mean_metrics = {}
diff --git a/detect_text.py b/detect_text.py
@@ -9,7 +9,7 @@
 from surya.model.processing import open_pdf, get_page_images
 from surya.detection import batch_inference
 from surya.postprocessing.affinity import draw_lines_on_image
-from surya.postprocessing.heatmap import draw_bboxes_on_image
+from surya.postprocessing.heatmap import draw_bboxes_on_image, draw_polys_on_image
 from surya.settings import settings
 import os
 import filetype
@@ -90,7 +90,7 @@ def main():
 
     if args.images:
         for idx, (image, pred, name) in enumerate(zip(images, predictions, names)):
-            bbox_image = draw_bboxes_on_image(pred["bboxes"], copy.deepcopy(image))
+            bbox_image = draw_polys_on_image(pred["polygons"], copy.deepcopy(image))
             bbox_image.save(os.path.join(result_path, f"{name}_{idx}_bbox.png"))
 
             column_image = draw_lines_on_image(pred["vertical_lines"], copy.deepcopy(image))
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.1.5"
+version = "0.1.6"
 description = "Document OCR models for multilingual text detection and recognition"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"
diff --git a/static/images/funsd.png b/static/images/funsd.png
diff --git a/surya/detection.py b/surya/detection.py
@@ -84,11 +84,13 @@ def batch_inference(images: List, model, processor):
         affinity_size = list(reversed(affinity_map.shape))
         heatmap_size = list(reversed(heatmap.shape))
         bboxes = get_and_clean_boxes(heatmap, heatmap_size, orig_sizes[i])
+        bbox_data = [bbox.model_dump() for bbox in bboxes]
         vertical_lines = get_vertical_lines(affinity_map, affinity_size, orig_sizes[i])
         horizontal_lines = get_horizontal_lines(affinity_map, affinity_size, orig_sizes[i])
 
         results.append({
-            "bboxes": bboxes,
+            "bboxes": [bbd["bbox"] for bbd in bbox_data],
+            "polygons": [bbd["corners"] for bbd in bbox_data],
             "vertical_lines": vertical_lines,
             "horizontal_lines": horizontal_lines,
             "heatmap": heat_img,
diff --git a/surya/postprocessing/heatmap.py b/surya/postprocessing/heatmap.py
@@ -1,24 +1,32 @@
+from typing import List
+
 import numpy as np
 import cv2
 import math
 from PIL import ImageDraw
 
 from surya.postprocessing.util import rescale_bbox
+from surya.schema import PolygonBox
 from surya.settings import settings
 
 
-def clean_contained_boxes(boxes):
+def clean_contained_boxes(boxes: List[PolygonBox]):
     new_boxes = []
-    for box in boxes:
+    for box_obj in boxes:
+        box = box_obj.bbox
         contained = False
-        for other_box in boxes:
+        for other_box_obj in boxes:
+            if other_box_obj.corners == box_obj.corners:
+                continue
+
+            other_box = other_box_obj.bbox
             if box == other_box:
                 continue
             if box[0] >= other_box[0] and box[1] >= other_box[1] and box[2] <= other_box[2] and box[3] <= other_box[3]:
                 contained = True
                 break
         if not contained:
-            new_boxes.append(box)
+            new_boxes.append(box_obj)
     return new_boxes
 
 
@@ -93,23 +101,14 @@ def get_detected_boxes(textmap, text_threshold=settings.DETECTOR_TEXT_THRESHOLD,
     textmap = textmap.astype(np.float32)
     boxes, labels = detect_boxes(textmap, text_threshold, low_text)
     # From point form to box form
-    boxes = [
-        [box[0][0], box[0][1], box[1][0], box[2][1]]
-        for box in boxes
-    ]
-
-    # Ensure correct box format
-    for box in boxes:
-        if box[0] > box[2]:
-            box[0], box[2] = box[2], box[0]
-        if box[1] > box[3]:
-            box[1], box[3] = box[3], box[1]
+    boxes = [PolygonBox(corners=box) for box in boxes]
     return boxes
 
 
 def get_and_clean_boxes(textmap, processor_size, image_size):
     bboxes = get_detected_boxes(textmap)
-    bboxes = [rescale_bbox(bbox, processor_size, image_size) for bbox in bboxes]
+    for bbox in bboxes:
+        bbox.rescale(processor_size, image_size)
     bboxes = clean_contained_boxes(bboxes)
     return bboxes
 
@@ -122,3 +121,14 @@ def draw_bboxes_on_image(bboxes, image):
 
     return image
 
+
+def draw_polys_on_image(corners, image):
+    """Outline every polygon in ``corners`` in red on ``image`` and return that image.
+
+    ``corners`` is an iterable of polygons; each polygon is an iterable of
+    points whose first two items are used as (x, y).
+    """
+    canvas = ImageDraw.Draw(image)
+    for quad in corners:
+        # Convert each point to an (x, y) tuple before handing it to ImageDraw.
+        vertices = [(pt[0], pt[1]) for pt in quad]
+        canvas.polygon(vertices, outline='red', width=1)
+
+    return image
+
+
diff --git a/surya/postprocessing/util.py b/surya/postprocessing/util.py
@@ -23,4 +23,22 @@ def rescale_bbox(bbox, processor_size, image_size):
     new_bbox[1] = int(new_bbox[1] * height_scaler)
     new_bbox[2] = int(new_bbox[2] * width_scaler)
     new_bbox[3] = int(new_bbox[3] * height_scaler)
-    return new_bbox
+    return new_bbox
+
+
+def rescale_point(point, processor_size, image_size):
+    """Map an [x, y] point from processor coordinates to image coordinates.
+
+    Both sizes are unpacked as (width, height). Returns a scaled deep copy of
+    ``point`` whose first two items are truncated with int(); ``point`` itself
+    is not modified.
+    """
+    proc_w, proc_h = processor_size
+    target_w, target_h = image_size
+    x_ratio, y_ratio = target_w / proc_w, target_h / proc_h
+
+    scaled = copy.deepcopy(point)
+    scaled[0] = int(scaled[0] * x_ratio)
+    scaled[1] = int(scaled[1] * y_ratio)
+    return scaled
+
+
+def rescale_points(points, processor_size, image_size):
+    """Rescale each point with rescale_point and return the results as a new list."""
+    rescaled = []
+    for pt in points:
+        rescaled.append(rescale_point(pt, processor_size, image_size))
+    return rescaled
diff --git a/surya/schema.py b/surya/schema.py
@@ -0,0 +1,80 @@
+import copy
+from typing import List, Tuple
+
+from pydantic import BaseModel, field_validator, computed_field
+
+
+class PolygonBox(BaseModel):
+    """Quadrilateral text region stored as four [x, y] corner points."""
+    corners: List[List[float]]
+
+    @field_validator('corners')
+    @classmethod
+    def check_elements(cls, v: List[List[float]]) -> List[List[float]]:
+        """Return ``v`` unchanged, or raise ValueError unless it is four two-element points."""
+        if len(v) != 4:
+            raise ValueError('corners must have 4 elements')
+        for corner in v:
+            if len(corner) != 2:
+                raise ValueError('corner must have 2 elements')
+        return v
+
+    @property
+    def height(self):
+        """Height of the axis-aligned bounding box."""
+        # With clockwise-from-top-left corners, corners[0] and corners[1] share the top edge.
+        _, top, _, bottom = self.bbox
+        return bottom - top
+
+    @property
+    def width(self):
+        """Width of the axis-aligned bounding box."""
+        left, _, right, _ = self.bbox
+        return right - left
+
+    @property
+    def area(self):
+        """Area of the axis-aligned bounding box (not of the polygon itself)."""
+        return self.width * self.height
+
+    @computed_field
+    @property
+    def bbox(self) -> List[float]:
+        """Smallest axis-aligned [x1, y1, x2, y2] rectangle enclosing all four corners."""
+        xs = [corner[0] for corner in self.corners]
+        ys = [corner[1] for corner in self.corners]
+        return [min(xs), min(ys), max(xs), max(ys)]
+
+    def rescale(self, processor_size, image_size):
+        """Scale the corners in place from ``processor_size`` to ``image_size`` (width, height)."""
+        page_width, page_height = processor_size
+        img_width, img_height = image_size
+        width_scaler = img_width / page_width
+        height_scaler = img_height / page_height
+        # int() truncates toward zero; fresh lists are built so the old ones are not mutated.
+        scaled = [[int(x * width_scaler), int(y * height_scaler)] for x, y in self.corners]
+        self.corners = scaled
+
+
+
+class Bbox(BaseModel):
+    """Box held as a four-element list; width/height read it as [x1, y1, x2, y2]."""
+    bbox: List[float]
+
+    @field_validator('bbox')
+    @classmethod
+    def check_4_elements(cls, v: List[float]) -> List[float]:
+        """Return ``v`` unchanged when it has exactly four items, else raise ValueError."""
+        if len(v) == 4:
+            return v
+        raise ValueError('bbox must have 4 elements')
+
+    @property
+    def height(self):
+        """Difference between the fourth and second values (y2 - y1)."""
+        bottom = self.bbox[3]
+        top = self.bbox[1]
+        return bottom - top
+
+    @property
+    def width(self):
+        """Difference between the third and first values (x2 - x1)."""
+        right = self.bbox[2]
+        left = self.bbox[0]
+        return right - left
+
+    @property
+    def area(self):
+        """Product of width and height."""
+        return self.width * self.height