Skip to content

Commit 07a715c

Browse files
committed
Fix slice pad
1 parent 260fc0c commit 07a715c

8 files changed

+165
-16
lines changed

benchmark/recognition.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from surya.ocr import run_ocr, run_recognition
88
from surya.postprocessing.text import draw_text_on_image
99
from surya.settings import settings
10+
from surya.languages import CODE_TO_LANGUAGE, is_arabic
11+
import arabic_reshaper
1012
import os
1113
import datasets
1214
import json
@@ -46,9 +48,12 @@ def main():
4648

4749
image_scores = defaultdict(list)
4850
for idx, (pred, ref_text, lang) in enumerate(zip(predictions_by_image, line_text, lang_list)):
51+
if any(is_arabic(l) for l in lang):
52+
ref_text = [arabic_reshaper.reshape(t) for t in ref_text]
53+
pred["text_lines"] = [arabic_reshaper.reshape(t) for t in pred["text_lines"]]
4954
image_score = overlap_score(pred["text_lines"], ref_text)
5055
for l in lang:
51-
image_scores[l].append(image_score)
56+
image_scores[CODE_TO_LANGUAGE[l]].append(image_score)
5257

5358
image_avgs = {l: sum(scores) / len(scores) for l, scores in image_scores.items()}
5459
print(image_avgs)

ocr_text.py

+8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from surya.ocr import run_ocr
1111
from surya.postprocessing.text import draw_text_on_image
1212
from surya.settings import settings
13+
from surya.languages import LANGUAGE_TO_CODE, CODE_TO_LANGUAGE
1314
import os
1415

1516

@@ -23,7 +24,14 @@ def main():
2324
parser.add_argument("--lang", type=str, help="Language to use for OCR. Comma separate for multiple.", default="en")
2425
args = parser.parse_args()
2526

27+
# Split and validate language codes
2628
langs = args.lang.split(",")
29+
for i in range(len(langs)):
30+
if langs[i] in LANGUAGE_TO_CODE:
31+
langs[i] = LANGUAGE_TO_CODE[langs[i]]
32+
if langs[i] not in CODE_TO_LANGUAGE:
33+
raise ValueError(f"Language code {langs[i]} not found.")
34+
2735
det_processor = load_detection_processor()
2836
det_model = load_detection_model()
2937

poetry.lock

+15-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ pymupdf = "^1.23.8"
3434
snakeviz = "^2.2.0"
3535
datasets = "^2.16.1"
3636
rapidfuzz = "^3.6.1"
37+
arabic-reshaper = "^3.0.0"
3738

3839
[tool.poetry.scripts]
3940
surya_detect = "detect_text:main"

surya/input/processing.py

+24-8
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,31 @@ def slice_polys_from_image(image: Image.Image, polys):
7575

7676

7777
def slice_and_pad_poly(image: Image.Image, coordinates):
    """Crop the bounding box of a polygon from *image*, blanking pixels
    that fall outside the polygon.

    :param image: source page image (PIL, any mode convertible to RGB).
    :param coordinates: sequence of polygon corners; each corner is an
        (x, y) pair (list or tuple — normalized to tuples for PIL below).
    :return: an RGB image exactly the size of the polygon's bounding box,
        with out-of-polygon pixels set to 0 (black).
    """
    # PIL's polygon drawing requires a list of (x, y) tuples.
    coordinates = [(corner[0], corner[1]) for corner in coordinates]

    # Rasterize the polygon once into a binary (0/1) mask covering the
    # whole image. The same mask serves both for pixel blanking and for
    # the bounding box — the original drew it twice, identically.
    mask_image = Image.new('L', image.size, 0)
    ImageDraw.Draw(mask_image).polygon(coordinates, outline=1, fill=1)

    # (left, top, right, bottom) of the polygon's nonzero extent.
    bbox = mask_image.getbbox()

    # Blank pixels outside the polygon. The mask array is uint8 0/1, so
    # test `mask == 0` for a boolean index: the previous `~mask` computed
    # bitwise-not (255/254) and was applied as *integer* row indexing,
    # which zeroes the wrong rows or raises IndexError on short images.
    mask = np.array(mask_image)
    polygon_image = np.array(image)
    polygon_image[mask == 0] = 0
    polygon_image = Image.fromarray(polygon_image)

    # The output rectangle is exactly bbox-sized, so the cropped polygon
    # pastes at the origin. (The original centering arithmetic reduces to
    # (0, 0) algebraically: bbox[0]+bbox[2] and bbox[2]-bbox[0] always
    # share parity, so the floor divisions cancel.)
    rectangle = Image.new('RGB', (bbox[2] - bbox[0], bbox[3] - bbox[1]), 'white')
    rectangle.paste(polygon_image.crop(bbox), (0, 0))

    return rectangle
105+

surya/languages.py

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# ISO 639-1 two-letter code -> human-readable language name, for every
# language the recognition model supports. Used to validate user-supplied
# --lang arguments and to label benchmark results.
CODE_TO_LANGUAGE = {
    'af': 'Afrikaans',
    'am': 'Amharic',
    'ar': 'Arabic',
    'as': 'Assamese',
    'az': 'Azerbaijani',
    'be': 'Belarusian',
    'bg': 'Bulgarian',
    'bn': 'Bangla',
    'br': 'Breton',
    'bs': 'Bosnian',
    'ca': 'Catalan',
    'cs': 'Czech',
    'cy': 'Welsh',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'eo': 'Esperanto',
    'es': 'Spanish',
    'et': 'Estonian',
    'eu': 'Basque',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'fy': 'Western Frisian',
    'ga': 'Irish',
    'gd': 'Scottish Gaelic',
    'gl': 'Galician',
    'gu': 'Gujarati',
    'ha': 'Hausa',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'is': 'Icelandic',
    'it': 'Italian',
    'ja': 'Japanese',
    'jv': 'Javanese',
    'ka': 'Georgian',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'kn': 'Kannada',
    'ko': 'Korean',
    'ku': 'Kurdish',
    'ky': 'Kyrgyz',
    'la': 'Latin',
    'lo': 'Lao',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mg': 'Malagasy',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mn': 'Mongolian',
    'mr': 'Marathi',
    'ms': 'Malay',
    'my': 'Burmese',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'om': 'Oromo',
    'or': 'Odia',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'ps': 'Pashto',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sa': 'Sanskrit',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'so': 'Somali',
    'sq': 'Albanian',
    'sr': 'Serbian',
    'su': 'Sundanese',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'th': 'Thai',
    'tl': 'Tagalog',
    'tr': 'Turkish',
    'ug': 'Uyghur',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'uz': 'Uzbek',
    'vi': 'Vietnamese',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'zh': 'Chinese'
}

# Reverse lookup: language name -> ISO 639-1 code. Safe because the names
# above are unique.
LANGUAGE_TO_CODE = {v: k for k, v in CODE_TO_LANGUAGE.items()}
98+
99+
100+
def is_arabic(lang_code):
    """Return True if *lang_code* is a language written in Arabic script.

    Arabic-script text needs reshaping (e.g. via arabic_reshaper) before
    glyph-level comparison or rendering.
    """
    arabic_script_codes = {"ar", "fa", "ps", "ug", "ur"}
    return lang_code in arabic_script_codes

surya/ocr.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def run_ocr(images: List[Image.Image], langs: List[List[str]], det_model, det_pr
5757
slice_map = []
5858
all_slices = []
5959
all_langs = []
60-
for idx, (image, det_pred, lang) in tqdm(enumerate(zip(images, det_predictions, langs)), desc="Slicing images"):
60+
for idx, (image, det_pred, lang) in enumerate(zip(images, det_predictions, langs)):
6161
slices = slice_polys_from_image(image, det_pred["polygons"])
6262
slice_map.append(len(slices))
6363
all_slices.extend(slices)
@@ -80,4 +80,4 @@ def run_ocr(images: List[Image.Image], langs: List[List[str]], det_model, det_pr
8080
"language": lang
8181
})
8282

83-
return predictions_by_image
83+
return predictions_by_image

surya/recognition.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,19 @@ def get_batch_size():
1919

2020
def batch_recognition(images: List, languages: List[List[str]], model, processor):
2121
assert all([isinstance(image, Image.Image) for image in images])
22+
assert len(images) == len(languages)
2223
batch_size = get_batch_size()
2324

2425
images = [image.convert("RGB") for image in images]
25-
model_inputs = processor(text=[""] * len(languages), images=images, lang=languages)
2626

2727
output_text = []
28-
for i in tqdm(range(0, len(model_inputs["pixel_values"]), batch_size), desc="Recognizing Text"):
29-
batch_langs = model_inputs["langs"][i:i+batch_size]
30-
batch_pixel_values = model_inputs["pixel_values"][i:i+batch_size]
28+
for i in tqdm(range(0, len(images), batch_size), desc="Recognizing Text"):
29+
batch_langs = languages[i:i+batch_size]
30+
batch_images = images[i:i+batch_size]
31+
model_inputs = processor(text=[""] * len(batch_langs), images=batch_images, lang=batch_langs)
32+
33+
batch_pixel_values = model_inputs["pixel_values"]
34+
batch_langs = model_inputs["langs"]
3135
batch_decoder_input = [[model.config.decoder_start_token_id] + lang for lang in batch_langs]
3236

3337
batch_langs = torch.from_numpy(np.array(batch_langs, dtype=np.int64)).to(model.device)

0 commit comments

Comments
 (0)