
Commit defbfca

Add pause tokens

1 parent 2a4716e

6 files changed: +63 -21 lines

surya/layout.py (+21 -2)

@@ -67,7 +67,7 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
         batch_pixel_values = model_inputs["pixel_values"]
         batch_pixel_values = torch.tensor(np.array(batch_pixel_values), dtype=model.dtype).to(model.device)
 
-        pause_token = [model.config.decoder.size_token_id] * 7
+        pause_token = [model.config.decoder.pause_token_id] * 7
         start_token = [model.config.decoder.bos_token_id] * 7
         batch_decoder_input = [
             [start_token] + [pause_token] * model.config.decoder.pause_token_count
@@ -80,12 +80,14 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
         model.decoder.model._setup_cache(model.config, batch_size, model.device, model.dtype)
 
         batch_predictions = [[] for _ in range(len(images))]
+        batch_entropies = [[] for _ in range(len(images))]
 
         with torch.inference_mode():
             encoder_hidden_states = model.encoder(pixel_values=batch_pixel_values)[0]
 
             token_count = 0
             all_done = torch.zeros(current_batch_size, dtype=torch.bool, device=model.device)
+            paused = [False] * current_batch_size
 
             while token_count < settings.LAYOUT_MAX_BOXES:
                 is_prefill = token_count == 0
@@ -101,6 +103,9 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
                 box_logits = return_dict["bbox_logits"][:current_batch_size, -1, :].detach()
                 class_logits = return_dict["class_logits"][:current_batch_size, -1, :].detach()
 
+                probs = torch.nn.functional.softmax(class_logits, dim=-1).detach().cpu()
+                entropy = torch.special.entr(probs).sum(dim=-1)
+
                 class_preds = class_logits.argmax(-1)
                 box_preds = box_logits * model.config.decoder.bbox_size
 
@@ -115,7 +120,20 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
 
                 for j, (pred, status) in enumerate(zip(batch_decoder_input, all_done)):
                     if not status:
-                        batch_predictions[j].append(pred[0].detach().clone())
+                        if paused[j]:
+                            if len(batch_entropies[j]) == 0 or entropy[j].item() < batch_entropies[j][-1]:
+                                batch_predictions[j][-1] = pred[0].detach().clone()
+                                batch_entropies[j][-1] = entropy[j].item()
+                        else:
+                            batch_predictions[j].append(pred[0].detach().clone())
+                            batch_entropies[j].append(entropy[j].item())
+
+                        # Add a pause token if needed
+                        if entropy[j].item() > .75 and not paused[j]:
+                            paused[j] = True
+                            batch_decoder_input[j, :] = model.decoder.config.pause_token_id
+                        else:
+                            paused[j] = False
 
                 token_count += inference_token_count
                 inference_token_count = batch_decoder_input.shape[1]
@@ -124,6 +142,7 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
         for j, (preds, orig_size) in enumerate(zip(batch_predictions, orig_sizes)):
            boxes = []
            if len(preds) > 0:
+                preds = [p for p in preds if p[6] > model.decoder.config.special_token_count]  # Remove special tokens, like pause
                 stacked_preds = torch.stack(preds, dim=0)
                 polygons = prediction_to_polygon(
                     stacked_preds,
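
A note on the decoding change above: torch.special.entr computes -p * log(p) elementwise, so summing it over the class axis gives the Shannon entropy (in nats) of each image's class distribution. When a step's entropy rises above the hardcoded .75 threshold, that image's decoder input is overwritten with pause tokens and the same position is decoded again, keeping whichever attempt was more confident. A minimal standalone sketch of the gating loop, with a hypothetical decode_step stub standing in for the real decoder:

    import torch

    PAUSE_ENTROPY_THRESHOLD = 0.75  # nats, matching the hardcoded .75 in the diff

    def shannon_entropy(logits: torch.Tensor) -> torch.Tensor:
        # torch.special.entr(p) = -p * log(p); summing over classes gives entropy in nats
        probs = torch.nn.functional.softmax(logits, dim=-1)
        return torch.special.entr(probs).sum(dim=-1)

    def decode_step(step: int):
        # Hypothetical stand-in for one decoder step: (7-wide prediction, class logits)
        torch.manual_seed(step)
        return torch.randint(0, 10, (7,)), torch.randn(11)

    predictions, entropies = [], []
    paused = False
    for step in range(5):
        pred, class_logits = decode_step(step)
        ent = shannon_entropy(class_logits).item()
        if paused:
            # Re-decoded after a pause: keep the more confident of the two attempts
            if not entropies or ent < entropies[-1]:
                predictions[-1], entropies[-1] = pred, ent
        else:
            predictions.append(pred)
            entropies.append(ent)
        # Pause (feed pause tokens and retry this position) only when not already paused
        paused = ent > PAUSE_ENTROPY_THRESHOLD and not paused

The final one-liner reproduces the diff's if/else: a paused stream always unpauses on the next step, so each position is retried at most once.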

surya/model/common/donut/encoder.py (+29 -6)

@@ -617,9 +617,31 @@ def __init__(self, config, layer_num, dim, input_resolution, depth, num_heads, n
 
         self.pointing = False
 
-        self.position_embeddings = None
-        if layer_num == 0 and config.starting_positional_embeddings:
-            self.position_embeddings = nn.Parameter(torch.zeros(1, input_resolution[0] * input_resolution[1] + config.encoder_length, dim))
+        self.positional_encoding = None
+        if config.use_positional_embeddings:
+            self.positional_encoding = self.build_2d_sincos_position_embedding(
+                input_resolution[1],
+                input_resolution[0],
+                embed_dim=dim,
+            )
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(
+        width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+    ):
+        grid_w = torch.arange(int(width), dtype=dtype, device=device)
+        grid_h = torch.arange(int(height), dtype=dtype, device=device)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        if embed_dim % 4 != 0:
+            raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
+        pos_dim = embed_dim // 4
+        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
+        omega = 1.0 / (temperature**omega)
+
+        out_w = grid_w.flatten()[..., None] @ omega[None]
+        out_h = grid_h.flatten()[..., None] @ omega[None]
+
+        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]
 
     def forward(
         self,
@@ -630,6 +652,10 @@ def forward(
         always_partition: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
         height, width = input_dimensions
+
+        if self.positional_encoding is not None:
+            hidden_states = hidden_states + self.positional_encoding.to(hidden_states.dtype).to(hidden_states.device)
+
         for i, layer_module in enumerate(self.blocks):
             layer_head_mask = head_mask[i] if head_mask is not None else None
@@ -639,9 +665,6 @@ def forward(
 
             hidden_states = layer_outputs[0]
 
-        if self.position_embeddings is not None:
-            hidden_states = hidden_states + self.position_embeddings[:, :hidden_states.size(1)]
-
         hidden_states_before_downsampling = hidden_states
         if self.downsample is not None:
             height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
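
The learned position_embeddings parameter gives way to a fixed 2D sin-cos table that is added once at the top of each stage's forward pass rather than after every block. The builder splits embed_dim into four chunks (sin and cos for each axis) over a geometric frequency ladder. A standalone copy of the new builder, runnable outside the encoder class, with an illustrative shape check (the 12x12 grid is an example, not a model constant):

    import torch

    def build_2d_sincos_position_embedding(
        width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ):
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
        # Four chunks of embed_dim // 4: sin/cos of the width axis, sin/cos of the height axis
        if embed_dim % 4 != 0:
            raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)  # geometric frequency ladder, as in transformer PEs

        out_w = grid_w.flatten()[..., None] @ omega[None]  # (width * height, pos_dim)
        out_h = grid_h.flatten()[..., None] @ omega[None]

        # (1, width * height, embed_dim), broadcastable over the batch dimension
        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]

    pe = build_2d_sincos_position_embedding(12, 12, embed_dim=256)
    assert pe.shape == (1, 144, 256)

Because the table is deterministic, it carries no checkpoint weights and can be rebuilt for any input_resolution.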

surya/model/layout/config.py (+6 -6)

@@ -93,11 +93,11 @@ def __init__(
         attention_probs_dropout_prob=0.0,
         drop_path_rate=0,
         hidden_act="gelu",
-        use_absolute_embeddings=True,
+        use_absolute_embeddings=False,
+        use_positional_embeddings=True,
         initializer_range=0.02,
         layer_norm_eps=1e-5,
         encoder_length=768,
-        starting_positional_embeddings=True,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -117,14 +117,14 @@ def __init__(
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
         self.drop_path_rate = drop_path_rate
         self.hidden_act = hidden_act
-        self.use_absolute_embeddings = use_absolute_embeddings
+        self.use_absolute_embeddings = False
         self.layer_norm_eps = layer_norm_eps
         self.initializer_range = initializer_range
         # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
         # this indicates the channel dimension after the last stage of the model
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.encoder_length = encoder_length
-        self.starting_positional_embeddings = starting_positional_embeddings
+        self.use_positional_embeddings = use_positional_embeddings
 
 
 class SuryaLayoutDecoderConfig(PretrainedConfig):
@@ -151,7 +151,7 @@ def __init__(
         pad_token_id=0,
         eos_token_id=1,
         bos_token_id=1,
-        size_token_id=2,
+        pause_token_id=2,
         img_size_bucket=100,
         hidden_activation="gelu_pytorch_tanh",
         rope_theta=10000.0,
@@ -206,7 +206,7 @@ def __init__(
         self.bbox_size = bbox_size
         self.label_count = label_count
         self.skew_scaler = skew_scaler
-        self.size_token_id = size_token_id
+        self.pause_token_id = pause_token_id
         self.img_size_bucket = img_size_bucket
         self.special_token_count = special_token_count
         self.layer_norm_eps = layer_norm_eps
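
These decoder config fields drive the pause mechanism end to end: pause_token_id (renamed from size_token_id) fills both the prefill input and the mid-decode retries in surya/layout.py, and any prediction whose class id (slot 6 of the 7-wide token) falls at or below special_token_count is stripped before boxes are built. A toy sketch of both ends; the pause_token_count and special_token_count values here are illustrative, not the checkpoint's:

    import torch

    bos_token_id, pause_token_id = 1, 2
    pause_token_count = 2    # illustrative; the real value lives in the decoder config
    special_token_count = 3  # illustrative

    # Prefill: one BOS 7-tuple followed by pause 7-tuples, as in batch_layout_detection
    start_token = [bos_token_id] * 7
    pause_token = [pause_token_id] * 7
    decoder_input = torch.tensor([[start_token] + [pause_token] * pause_token_count])
    assert decoder_input.shape == (1, 1 + pause_token_count, 7)

    # Post-processing: drop special-token predictions before building boxes
    preds = [
        torch.tensor([10, 10, 90, 90, 0, 0, 5]),          # real layout box, class id 5
        torch.tensor([0, 0, 0, 0, 0, 0, pause_token_id]), # pause step, filtered out
    ]
    preds = [p for p in preds if p[6] > special_token_count]
    assert len(preds) == 1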

surya/model/recognition/config.py (+2 -2)

@@ -57,7 +57,7 @@ def __init__(
         initializer_range=0.02,
         layer_norm_eps=1e-5,
         encoder_length=256,
-        starting_positional_embeddings=False,
+        use_positional_embeddings=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -84,7 +84,7 @@ def __init__(
         # this indicates the channel dimension after the last stage of the model
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.encoder_length = encoder_length
-        self.starting_positional_embeddings = starting_positional_embeddings
+        self.use_positional_embeddings = use_positional_embeddings
 
 
 class SuryaOCRDecoderConfig(PretrainedConfig):

surya/model/table_rec/config.py (+2 -2)

@@ -72,7 +72,7 @@ def __init__(
         initializer_range=0.02,
         layer_norm_eps=1e-5,
         encoder_length=1024,
-        starting_positional_embeddings=False,
+        use_positional_embeddings=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -99,7 +99,7 @@ def __init__(
         # this indicates the channel dimension after the last stage of the model
         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
         self.encoder_length = encoder_length
-        self.starting_positional_embeddings = starting_positional_embeddings
+        self.use_positional_embeddings = use_positional_embeddings
 
 
 class SuryaTableRecDecoderConfig(PretrainedConfig):

surya/settings.py (+3 -3)

@@ -65,11 +65,11 @@ def TORCH_DEVICE_MODEL(self) -> str:
     RECOGNITION_ENCODER_BATCH_DIVISOR: int = 1  # Divisor for batch size in decoder
 
     # Layout
-    LAYOUT_MODEL_CHECKPOINT: str = "datalab-to/layout_order_hr"
-    LAYOUT_IMAGE_SIZE: Dict = {"height": 896, "width": 896}
+    LAYOUT_MODEL_CHECKPOINT: str = "datalab-to/layout_order_hr3"
+    LAYOUT_IMAGE_SIZE: Dict = {"height": 768, "width": 768}
     LAYOUT_BATCH_SIZE: Optional[int] = None
     LAYOUT_BENCH_DATASET_NAME: str = "vikp/publaynet_bench"
-    LAYOUT_MAX_BOXES: int = 75
+    LAYOUT_MAX_BOXES: int = 150
     COMPILE_LAYOUT: bool = False
 
     # Table Rec
