Detection working on XLA

iammosespaulr · iammosespaulr · commit 5c286cc2ae70 · 2024-12-17T20:51:16.000Z
diff --git a/benchmark/detection.py b/benchmark/detection.py
@@ -17,6 +17,8 @@
 from tabulate import tabulate
 import datasets
 
+import torch
+import torch_xla.core.xla_model as xm
 
 def main():
     parser = argparse.ArgumentParser(description="Detect bboxes in a PDF.")
@@ -27,7 +29,7 @@ def main():
     parser.add_argument("--tesseract", action="store_true", help="Run tesseract as well.", default=False)
     args = parser.parse_args()
 
-    model = load_model()
+    model = load_model(device=xm.xla_device(), dtype=torch.bfloat16)
     processor = load_processor()
 
     if args.pdf_path is not None:
diff --git a/surya/detection.py b/surya/detection.py
@@ -97,7 +97,7 @@ def batch_detection(
         if current_shape != correct_shape:
             logits = F.interpolate(logits, size=correct_shape, mode='bilinear', align_corners=False)
 
-        logits = logits.cpu().detach().numpy().astype(np.float32)
+        logits = logits.to(torch.float32).cpu().detach().numpy()
         preds = []
         for i, (idx, height) in enumerate(zip(split_index, split_heights)):
             # If our current prediction length is below the image idx, that means we have a new image
diff --git a/surya/model/detection/model.py b/surya/model/detection/model.py
@@ -14,6 +14,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch_xla.core.xla_model as xm
 
 from transformers import PreTrainedModel
 from transformers.modeling_outputs import SemanticSegmenterOutput
@@ -35,7 +36,7 @@ def load_model(checkpoint=settings.DETECTOR_MODEL_CHECKPOINT, device=settings.TO
         torch._dynamo.config.suppress_errors = False
 
         print(f"Compiling detection model {checkpoint} on device {device} with dtype {dtype}")
-        model = torch.compile(model)
+        model = torch.compile(model, backend='openxla')
 
     print(f"Loaded detection model {checkpoint} on device {device} with dtype {dtype}")
     return model
@@ -805,4 +806,4 @@ def forward(
             loss=None,
             logits=logits,
             hidden_states=encoder_hidden_states
-        )
+        )
diff --git a/surya/model/layout/model.py b/surya/model/layout/model.py
@@ -1,4 +1,5 @@
 import torch
+import torch_xla.core.xla_model as xm
 
 from surya.model.layout.encoderdecoder import SuryaLayoutModel
 from surya.model.layout.config import SuryaLayoutConfig, SuryaLayoutDecoderConfig, DonutSwinLayoutConfig
@@ -25,8 +26,8 @@ def load_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=settings.TORC
         torch._dynamo.config.suppress_errors = False
 
         print(f"Compiling layout model {checkpoint} on device {device} with dtype {dtype}")
-        model.encoder = torch.compile(model.encoder)
-        model.decoder = torch.compile(model.decoder)
+        model.encoder = torch.compile(model.encoder, backend='openxla')
+        model.decoder = torch.compile(model.decoder, backend='openxla')
 
     print(f"Loaded layout model {checkpoint} on device {device} with dtype {dtype}")
     return model
diff --git a/surya/model/recognition/model.py b/surya/model/recognition/model.py
@@ -1,6 +1,7 @@
 import warnings
 
 import torch
+import torch_xla.core.xla_model as xm
 
 warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")
 
@@ -52,9 +53,9 @@ def load_model(checkpoint=settings.RECOGNITION_MODEL_CHECKPOINT, device=settings
 
 
         print(f"Compiling recognition model {checkpoint} on device {device} with dtype {dtype}")
-        model.encoder = torch.compile(model.encoder)
-        model.decoder = torch.compile(model.decoder)
-        model.text_encoder = torch.compile(model.text_encoder)
+        model.encoder = torch.compile(model.encoder, backend='openxla')
+        model.decoder = torch.compile(model.decoder, backend='openxla')
+        model.text_encoder = torch.compile(model.text_encoder, backend='openxla')
 
     print(f"Loaded recognition model {checkpoint} on device {device} with dtype {dtype}")
     return model
diff --git a/surya/model/table_rec/model.py b/surya/model/table_rec/model.py
@@ -6,6 +6,7 @@
 from surya.settings import settings
 
 import torch
+import torch_xla.core.xla_model as xm
 
 
 def load_model(checkpoint=settings.TABLE_REC_MODEL_CHECKPOINT, device=settings.TORCH_DEVICE_MODEL, dtype=settings.MODEL_DTYPE) -> TableRecEncoderDecoderModel:
@@ -39,9 +40,9 @@ def load_model(checkpoint=settings.TABLE_REC_MODEL_CHECKPOINT, device=settings.T
 
         
         print(f"Compiling table recognition model {checkpoint} on device {device} with dtype {dtype}")
-        model.encoder = torch.compile(model.encoder)
-        model.decoder = torch.compile(model.decoder)
-        model.text_encoder = torch.compile(model.text_encoder)
+        model.encoder = torch.compile(model.encoder, backend='openxla')
+        model.decoder = torch.compile(model.decoder, backend='openxla')
+        model.text_encoder = torch.compile(model.text_encoder, backend='openxla')
 
     print(f"Loaded table recognition model {checkpoint} on device {device} with dtype {dtype}")
     return model