Commit 51c4c5f

Add slicing logic
1 parent e790f50 commit 51c4c5f

4 files changed, +180 -8 lines


surya/input/slicing.py

+129
@@ -0,0 +1,129 @@
+import math
+from typing import List, Tuple
+
+import cv2
+import numpy as np
+from PIL import Image
+
+from surya.schema import LayoutResult
+
+SLICES_TYPE = Tuple[List[Image.Image], List[Tuple[int, int, int]]]
+
+
+class ImageSlicer:
+    merge_tolerance = .05
+
+    def __init__(self, slice_min_dims, max_slices=4):
+        self.slice_min_dims = slice_min_dims
+        self.max_slices = max_slices
+
+    def slice(self, images: List[Image.Image]) -> SLICES_TYPE:
+        all_slices = []
+        all_positions = []
+
+        for idx, image in enumerate(images):
+            if (image.size[0] > self.slice_min_dims["width"] or
+                    image.size[1] > self.slice_min_dims["height"]):
+                img_slices, positions = self._slice_image(image, idx)
+                all_slices.extend(img_slices)
+                all_positions.extend(positions)
+            else:
+                all_slices.append(image)
+                all_positions.append((idx, 0, 0))
+
+        return all_slices, all_positions
+
+    def slice_count(self, image: Image.Image) -> int:
+        width, height = image.size
+        if width > height:
+            slice_size = self._calculate_slice_size(width, "width")
+            return math.ceil(width / slice_size)
+        else:
+            slice_size = self._calculate_slice_size(height, "height")
+            return math.ceil(height / slice_size)
+
+    def _calculate_slice_size(self, dimension: int, dim_type: str) -> int:
+        min_size = self.slice_min_dims[dim_type]
+        return max(min_size, (dimension // self.max_slices + 1))
+
+    def _slice_image(self, image: Image.Image, idx: int) -> SLICES_TYPE:
+        width, height = image.size
+        slices = []
+        positions = []
+
+        if width > height:
+            slice_size = self._calculate_slice_size(width, "width")
+            for i, x in enumerate(range(0, width, slice_size)):
+                slice_end = min(x + slice_size, width)
+                slices.append(image.crop((x, 0, slice_end, height)))
+                positions.append((idx, i, 0))
+        else:
+            slice_size = self._calculate_slice_size(height, "height")
+            for i, y in enumerate(range(0, height, slice_size)):
+                slice_end = min(y + slice_size, height)
+                slices.append(image.crop((0, y, width, slice_end)))
+                positions.append((idx, 0, i))
+
+        return slices, positions
+
+    def join(self, results: List[LayoutResult], tile_positions: List[Tuple[int, int, int]]) -> List[LayoutResult]:
+        new_results = []
+        current_result = None
+        for idx, (result, tile_position) in enumerate(zip(results, tile_positions)):
+            image_idx, tile_x, tile_y = tile_position
+            if idx == 0 or image_idx != tile_positions[idx - 1][0]:
+                if current_result is not None:
+                    new_results.append(current_result)
+                current_result = result
+            else:
+                merge_dir = "width" if tile_x > 0 else "height"
+                current_result = self.merge_results(current_result, result, merge_dir=merge_dir)
+        if current_result is not None:
+            new_results.append(current_result)
+        return new_results
+
+
+    def merge_results(self, res1: LayoutResult, res2: LayoutResult, merge_dir="width") -> LayoutResult:
+        new_image_bbox = res1.image_bbox.copy()
+        to_remove_idxs = set()
+        if merge_dir == "width":
+            new_image_bbox[2] += res2.image_bbox[2]
+            max_position = max([box.position for box in res1.bboxes])
+            for i, box2 in enumerate(res2.bboxes):
+                box2.shift(x_shift=res1.image_bbox[2])
+                box2.position += max_position
+                for j, box1 in enumerate(res1.bboxes):
+                    if all([
+                        box1.intersection_area(box2, x_margin=.1) > self.merge_tolerance,
+                        (
+                            box1.y_overlap(box2, y_margin=.1) > box1.height // 2 or
+                            box2.y_overlap(box1, y_margin=.1) > box2.height // 2
+                        ),
+                        box1.label == box2.label
+                    ]):
+                        box1.merge(box2)
+                        to_remove_idxs.add(i)
+
+        elif merge_dir == "height":
+            new_image_bbox[3] += res2.image_bbox[3]
+            max_position = max([box.position for box in res1.bboxes])
+            for i, box2 in enumerate(res2.bboxes):
+                box2.shift(y_shift=res1.image_bbox[3])
+                box2.position += max_position
+                for j, box1 in enumerate(res1.bboxes):
+                    if all([
+                        box1.intersection_area(box2, y_margin=.1) > self.merge_tolerance,
+                        (
+                            box1.x_overlap(box2, x_margin=.1) > box1.width // 2 or
+                            box2.x_overlap(box1, x_margin=.1) > box2.width // 2
+                        ),
+                        box1.label == box2.label
+                    ]):
+                        box1.merge(box2)
+                        to_remove_idxs.add(i)
+
+        new_result = LayoutResult(
+            image_bbox=new_image_bbox,
+            bboxes=res1.bboxes + [b for i, b in enumerate(res2.bboxes) if i not in to_remove_idxs]
+        )
+        return new_result
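
As a reading aid, here is a minimal sketch of how ImageSlicer behaves on its own. The 1200px thresholds mirror the new LAYOUT_SLICE_SIZE default below; the 2500x800 page is an invented example, not part of this commit:

from PIL import Image
from surya.input.slicing import ImageSlicer

# Slice anything wider or taller than 1200px, into at most 4 slices.
slicer = ImageSlicer({"width": 1200, "height": 1200}, max_slices=4)

# A hypothetical wide page: 2500px exceeds the width threshold.
page = Image.new("RGB", (2500, 800))

# slice_size = max(1200, 2500 // 4 + 1) = 1200, so ceil(2500 / 1200) = 3 slices.
print(slicer.slice_count(page))  # 3

slices, positions = slicer.slice([page])
print([s.size for s in slices])  # [(1200, 800), (1200, 800), (100, 800)]
print(positions)                 # [(0, 0, 0), (0, 1, 0), (0, 2, 0)] as (image_idx, tile_x, tile_y)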

surya/layout.py

+33-5
@@ -6,6 +6,7 @@
 
 from tqdm import tqdm
 
+from surya.input.slicing import ImageSlicer
 from surya.model.layout.config import ID_TO_LABEL
 from surya.postprocessing.heatmap import clean_boxes, intersects_other_boxes
 from surya.schema import LayoutResult, LayoutBox
@@ -68,10 +69,31 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
     if batch_size is None:
         batch_size = get_batch_size()
 
+    slicer = ImageSlicer(settings.LAYOUT_SLICE_SIZE)
+
+    batches = []
+    img_counts = [slicer.slice_count(image) for image in images]
+
+    start_idx = 0
+    end_idx = 1
+    while end_idx < len(img_counts):
+        if any([
+            sum(img_counts[start_idx:end_idx]) >= batch_size,
+            sum(img_counts[start_idx:end_idx + 1]) > batch_size,
+        ]):
+            batches.append((start_idx, end_idx))
+            start_idx = end_idx
+        end_idx += 1
+
+    if start_idx < len(img_counts):
+        batches.append((start_idx, len(img_counts)))
+
     results = []
-    for i in tqdm(range(0, len(images), batch_size), desc="Recognizing layout"):
-        batch_images = images[i:i+batch_size]
+    for (start_idx, end_idx) in tqdm(batches, desc="Recognizing layout"):
+        batch_results = []
+        batch_images = images[start_idx:end_idx]
         batch_images = [image.convert("RGB") for image in batch_images]  # also copies the image
+        batch_images, tile_positions = slicer.slice(batch_images)
         current_batch_size = len(batch_images)
 
         orig_sizes = [image.size for image in batch_images]
@@ -84,15 +106,15 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
         start_token = [model.config.decoder.bos_token_id] * 7
         batch_decoder_input = [
            [start_token] + [pause_token] * model.config.decoder.pause_token_count
-            for j in range(current_batch_size)
+           for _ in range(current_batch_size)
        ]
         batch_decoder_input = torch.tensor(np.stack(batch_decoder_input, axis=0), dtype=torch.long, device=model.device)
         inference_token_count = batch_decoder_input.shape[1]
 
         decoder_position_ids = torch.ones_like(batch_decoder_input[0, :, 0], dtype=torch.int64, device=model.device).cumsum(0) - 1
         model.decoder.model._setup_cache(model.config, batch_size, model.device, model.dtype)
 
-        batch_predictions = [[] for _ in range(len(images))]
+       batch_predictions = [[] for _ in range(current_batch_size)]
 
         with torch.inference_mode():
             encoder_hidden_states = model.encoder(pixel_values=batch_pixel_values)[0]
@@ -188,5 +210,11 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
                 bboxes=boxes,
                 image_bbox=[0, 0, orig_size[0], orig_size[1]]
             )
-            results.append(result)
+           batch_results.append(result)
+
+       assert len(batch_results) == len(tile_positions)
+       batch_results = slicer.join(batch_results, tile_positions)
+       results.extend(batch_results)
+
+   assert len(results) == len(images)
     return results
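
The while loop above packs whole images into consecutive batches by their slice counts, so all tiles of one image land in the same batch (which slicer.join() relies on). A standalone sketch of that packing rule, with invented slice counts and a hypothetical batch_size of 4:

def pack_batches(img_counts, batch_size):
    # Greedily group consecutive images; close a group once it has reached
    # batch_size, or when adding the next image would push it over.
    # A single image's tiles are never split across groups.
    batches = []
    start_idx, end_idx = 0, 1
    while end_idx < len(img_counts):
        if any([
            sum(img_counts[start_idx:end_idx]) >= batch_size,
            sum(img_counts[start_idx:end_idx + 1]) > batch_size,
        ]):
            batches.append((start_idx, end_idx))
            start_idx = end_idx
        end_idx += 1
    if start_idx < len(img_counts):
        batches.append((start_idx, len(img_counts)))
    return batches

# Images yielding 1, 3, 1, and 2 slices: the first group closes at 4 slices.
print(pack_batches([1, 3, 1, 2], 4))  # [(0, 2), (2, 4)]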

surya/schema.py

+16-2
@@ -72,10 +72,16 @@ def merge(self, other):
         self.polygon = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
 
     def intersection_area(self, other, x_margin=0, y_margin=0):
-        x_overlap = max(0, min(self.bbox[2] + x_margin, other.bbox[2] + x_margin) - max(self.bbox[0] - x_margin, other.bbox[0] - x_margin))
-        y_overlap = max(0, min(self.bbox[3] + y_margin, other.bbox[3] + y_margin) - max(self.bbox[1] - y_margin, other.bbox[1] - y_margin))
+        x_overlap = self.x_overlap(other, x_margin)
+        y_overlap = self.y_overlap(other, y_margin)
         return x_overlap * y_overlap
 
+    def x_overlap(self, other, x_margin=0):
+        return max(0, min(self.bbox[2] + x_margin, other.bbox[2] + x_margin) - max(self.bbox[0] - x_margin, other.bbox[0] - x_margin))
+
+    def y_overlap(self, other, y_margin=0):
+        return max(0, min(self.bbox[3] + y_margin, other.bbox[3] + y_margin) - max(self.bbox[1] - y_margin, other.bbox[1] - y_margin))
+
     def intersection_pct(self, other, x_margin=0, y_margin=0):
         assert 0 <= x_margin <= 1
         assert 0 <= y_margin <= 1
@@ -90,6 +96,14 @@ def intersection_pct(self, other, x_margin=0, y_margin=0):
         intersection = self.intersection_area(other, x_margin, y_margin)
         return intersection / self.area
 
+    def shift(self, x_shift: float | None = None, y_shift: float | None = None):
+        if x_shift is not None:
+            for corner in self.polygon:
+                corner[0] += x_shift
+        if y_shift is not None:
+            for corner in self.polygon:
+                corner[1] += y_shift
+
 
 class Bbox(BaseModel):
     bbox: List[float]
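
To make the new overlap helpers concrete, here is the same formula applied to raw bbox lists (a stand-in for illustration, not the pydantic class itself):

def x_overlap(a, b, x_margin=0):
    # 1-D overlap of the x-intervals [a[0], a[2]] and [b[0], b[2]],
    # with each interval widened by x_margin on both sides.
    return max(0, min(a[2] + x_margin, b[2] + x_margin) - max(a[0] - x_margin, b[0] - x_margin))

box_a = [0, 0, 10, 10]
box_b = [8, 0, 20, 10]
print(x_overlap(box_a, box_b))     # 2
print(x_overlap(box_a, box_b, 1))  # 4: both intervals widened by 1

shift() plays the complementary role when joining tiles: merge_results() calls it with the first tile's extent so a second tile's polygon corners are translated back into full-image coordinates before the overlap tests run.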

surya/settings.py

+2-1
@@ -65,8 +65,9 @@ def TORCH_DEVICE_MODEL(self) -> str:
     RECOGNITION_ENCODER_BATCH_DIVISOR: int = 1  # Divisor for batch size in decoder
 
     # Layout
-    LAYOUT_MODEL_CHECKPOINT: str = "datalab-to/layout_order_hr4"
+    LAYOUT_MODEL_CHECKPOINT: str = "datalab-to/layout_order_hr3"
     LAYOUT_IMAGE_SIZE: Dict = {"height": 768, "width": 768}
+    LAYOUT_SLICE_SIZE: Dict = {"height": 1200, "width": 1200}  # When to start slicing images
     LAYOUT_BATCH_SIZE: Optional[int] = None
     LAYOUT_BENCH_DATASET_NAME: str = "vikp/publaynet_bench"
     LAYOUT_MAX_BOXES: int = 100
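
Tying the setting to the slicer, a small sketch (assuming the module-level settings instance that batch_layout_detection already reads LAYOUT_SLICE_SIZE from):

from PIL import Image
from surya.input.slicing import ImageSlicer
from surya.settings import settings

# Mirrors the construction in batch_layout_detection: images larger than
# 1200px in either dimension get sliced before layout detection.
slicer = ImageSlicer(settings.LAYOUT_SLICE_SIZE)

small = Image.new("RGB", (800, 1000))  # under both 1200px thresholds
tall = Image.new("RGB", (900, 3000))   # tall page, sliced along height
print(slicer.slice_count(small))  # 1
print(slicer.slice_count(tall))   # 3: slice_size = max(1200, 3000 // 4 + 1) = 1200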
