VikParuchuri
diff --git a/‎README.md
+72-14 b/‎README.md
+72-14
diff --git a/‎benchmark/detection.py
+4-4 b/‎benchmark/detection.py
+4-4
diff --git a/‎benchmark/recognition.py
+81 b/‎benchmark/recognition.py
+81
diff --git a/‎benchmark/scoring.py
+20 b/‎benchmark/scoring.py
+20
diff --git a/‎demo_app.py
+3-3 b/‎demo_app.py
+3-3
diff --git a/‎detect_text.py
+5-4 b/‎detect_text.py
+5-4
@@ -2,11 +2,11 @@
 
 Surya is a multilingual document OCR toolkit.  It can do:
 
-- Accurate line-level text detection
-- Text recognition (coming soon)
+- Accurate line-level text detection in any language
+- Text recognition in 90+ languages
 - Table and chart detection (coming soon)
 
-It works on a range of documents and languages (see [usage](#usage) and [benchmarks](#benchmarks) for more details).
+It works on a range of documents (see [usage](#usage) and [benchmarks](#benchmarks) for more details).
 
 ![New York Times Article Example](static/images/excerpt.png)
 
@@ -46,6 +46,62 @@ Model weights will automatically download the first time you run surya.
 - Inspect the settings in `surya/settings.py`.  You can override any settings with environment variables.
 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`. Note that the `mps` device has a bug (on the [Apple side](https://github.com/pytorch/pytorch/issues/84936)) that may prevent it from working properly.
 
+## OCR (text recognition)
+
+You can run OCR on an image, pdf, or folder of images/pdfs with the following command.  This will write out a json file with the detected text and bboxes, and optionally save images of the reconstructed page.
+
+```
+surya_ocr DATA_PATH --images --lang hi,en
+```
+
+- `DATA_PATH` can be an image, pdf, or folder of images/pdfs
+- `--lang` specifies the language(s) to use for OCR.  You can comma separate multiple languages. Use the language name or two-letter ISO code from [here](https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes).  Surya supports the 90+ languages found in `surya/languages.py`.
+- `--lang_file` specifies a file with per-document languages, if you want to use different languages for different PDFs/images.  The format is a JSON dict with the keys being filenames and the values as a list, like `{"file1.pdf": ["en", "hi"], "file2.pdf": ["en"]}`.
+- `--images` will save images of the pages and detected text lines (optional)
+- `--results_dir` specifies the directory to save results to instead of the default
+- `--max` specifies the maximum number of pages to process if you don't want to process everything
+- `--start_page` specifies the page number to start processing from
+
+The `results.json` file will contain these keys for each page of the input document(s):
+
+- `text_lines` - the detected text in each line
+- `polys` - the polygons for each detected text line in (x1, y1), (x2, y2), (x3, y3), (x4, y4) format.  The points are in clockwise order from the top left.
+- `bboxes` - the axis-aligned rectangles for each detected text line in (x1, y1, x2, y2) format.  (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner.
+- `language` - the languages specified for the page
+- `name` - the name of the file
+- `page_number` - the page number in the file
+
+**Performance tips**
+
+Setting the `RECOGNITION_BATCH_SIZE` env var properly will make a big difference when using a GPU.  Each batch item will use `40MB` of VRAM, so very high batch sizes are possible.  The default is a batch size of `256`, which will use about 10GB of VRAM.
+
+Depending on your CPU core count, `RECOGNITION_BATCH_SIZE` might make a difference there too - the default CPU batch size is `32`.
+
+
+### From Python
+
+You can also do OCR from code with:
+
+```
+from PIL import Image
+from surya.ocr import run_ocr
+from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
+from surya.model.recognition.model import load_model as load_rec_model
+from surya.model.recognition.processor import load_processor as load_rec_processor
+
+image = Image.open(IMAGE_PATH)
+langs = ["en"] # Replace with your languages
+
+det_processor = load_det_processor()
+det_model = load_det_model()
+
+rec_model = load_rec_model()
+rec_processor = load_rec_processor()
+
+predictions = run_ocr([image], langs, det_model, det_processor, rec_model, rec_processor)
+```
+
+
 ## Text line detection
 
 You can detect text lines in an image, pdf, or folder of images/pdfs with the following command.  This will write out a json file with the detected bboxes, and optionally save images of the pages with the bboxes.
@@ -75,26 +131,23 @@ Depending on your CPU core count, `DETECTOR_BATCH_SIZE` might make a difference
 
 You can adjust `DETECTOR_NMS_THRESHOLD` and `DETECTOR_TEXT_THRESHOLD` if you don't get good results.  Try lowering them to detect more text, and vice versa.
 
+
 ### From Python
 
 You can also do text detection from code with:
 
 ```
 from PIL import Image
-from surya.detection import batch_inference
+from surya.detection import batch_detection
-from surya.model.segformer import load_model, load_processor
+from surya.model.detection.segformer import load_model, load_processor
 
 image = Image.open(IMAGE_PATH)
 model, processor = load_model(), load_processor()
 
 # predictions is a list of dicts, one per image
-predictions = batch_inference([image], model, processor)
+predictions = batch_detection([image], model, processor)
 ```
 
-## Text recognition
-
-Coming soon.
-
 ## Table and chart detection
 
 Coming soon.
@@ -113,10 +166,14 @@ If you want to develop surya, you can install it manually:
 - This is specialized for document OCR.  It will likely not work on photos or other images.
 - It is for printed text, not handwriting.
 - The model has trained itself to ignore advertisements.
-- This has worked for every language I've tried, but languages with very different character sets may not work well.
+- You can find language support for OCR in `surya/languages.py`.  Text detection should work with any language.
 
 # Benchmarks
 
+## OCR
+
+Coming soon.
+
 ## Text line detection
 
 ![Benchmark chart](static/images/benchmark_chart_small.png)
@@ -168,13 +225,13 @@ python benchmark/detection.py --max 256
 
 # Training
 
-This was trained on 4x A6000s for about 3 days.  It used a diverse set of images as training data.  It was trained from scratch using a modified segformer architecture that reduces inference RAM requirements.
+The text detection was trained on 4x A6000s for about 3 days.  It used a diverse set of images as training data.  It was trained from scratch using a modified segformer architecture that reduces inference RAM requirements.
 
-# Commercial usage
+Text recognition was trained on 4x A6000s for 2 weeks.  It was trained using a modified donut model (GQA, MoE layer, UTF-16 decoding, layer config changes).
 
-**Text detection**
+# Commercial usage
 
-The text detection model was trained from scratch, so it's okay for commercial usage.  The weights are licensed cc-by-nc-sa-4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period.
+The text detection and OCR models were trained from scratch, so they're okay for commercial usage.  The weights are licensed cc-by-nc-sa-4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period.
 
 If you want to remove the GPL license requirements for inference or use the weights commercially over the revenue limit, please contact me at surya@vikas.sh for dual licensing.
 
@@ -183,6 +240,7 @@ If you want to remove the GPL license requirements for inference or use the weig
 This work would not have been possible without amazing open source AI work:
 
 - [Segformer](https://arxiv.org/pdf/2105.15203.pdf) from NVIDIA
+- [Donut](https://github.com/clovaai/donut) from Naver
 - [transformers](https://github.com/huggingface/transformers) from huggingface
 - [CRAFT](https://github.com/clovaai/CRAFT-pytorch), a great scene text detection model
 
 
@@ -6,9 +6,9 @@
 from surya.benchmark.bbox import get_pdf_lines
 from surya.benchmark.metrics import precision_recall
 from surya.benchmark.tesseract import tesseract_parallel
-from surya.model.segformer import load_model, load_processor
+from surya.model.detection.segformer import load_model, load_processor
 from surya.input.processing import open_pdf, get_page_images
-from surya.detection import batch_inference
+from surya.detection import batch_detection
 from surya.postprocessing.heatmap import draw_polys_on_image
 from surya.postprocessing.util import rescale_bbox
 from surya.settings import settings
@@ -44,7 +44,7 @@ def main():
     else:
         pathname = "doclaynet_bench"
         # These have already been shuffled randomly, so sampling from the start is fine
-        dataset = datasets.load_dataset(settings.BENCH_DATASET_NAME, split=f"train[:{args.max}]")
+        dataset = datasets.load_dataset(settings.DETECTOR_BENCH_DATASET_NAME, split=f"train[:{args.max}]")
         images = list(dataset["image"])
         images = [i.convert("RGB") for i in images]
         correct_boxes = []
@@ -54,7 +54,7 @@ def main():
             correct_boxes.append([rescale_bbox(b, (1000, 1000), img_size) for b in boxes])
 
     start = time.time()
-    predictions = batch_inference(images, model, processor)
+    predictions = batch_detection(images, model, processor)
     surya_time = time.time() - start
 
     start = time.time()
 
@@ -0,0 +1,81 @@
+import argparse
+from collections import defaultdict
+
+from benchmark.scoring import overlap_score
+from surya.model.recognition.model import load_model as load_recognition_model
+from surya.model.recognition.processor import load_processor as load_recognition_processor
+from surya.ocr import run_ocr, run_recognition
+from surya.postprocessing.text import draw_text_on_image
+from surya.settings import settings
+from surya.languages import CODE_TO_LANGUAGE, is_arabic
+import arabic_reshaper
+import os
+import datasets
+import json
+
+
+def main():
+    """Benchmark text recognition against the recognition benchmark dataset.
+
+    Loads the dataset named by `settings.RECOGNITION_BENCH_DATASET_NAME`, runs
+    `run_recognition` on every image using the bboxes stored in the dataset,
+    scores the predicted lines against the reference text with
+    `overlap_score`, and prints the mean score per language.
+
+    The per-image scores, grouped by language name, are written to
+    `<results_dir>/rec_bench/results.json`.  With `--debug`, the images
+    returned by `draw_text_on_image` for the predicted and reference text,
+    plus the source image, are saved to the same directory.
+    """
+    parser = argparse.ArgumentParser(description="Benchmark text recognition (OCR) against a reference dataset.")
+    parser.add_argument("--results_dir", type=str, help="Path to directory to save benchmark results in.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
+    parser.add_argument("--max", type=int, help="Maximum number of pdf pages to OCR.", default=None)
+    parser.add_argument("--debug", action="store_true", help="Run in debug mode.", default=False)
+    args = parser.parse_args()
+
+    rec_model = load_recognition_model()
+    rec_processor = load_recognition_processor()
+
+    # Only request the first `--max` rows of the split when a limit was given.
+    split = f"train[:{args.max}]" if args.max else "train"
+
+    dataset = datasets.load_dataset(settings.RECOGNITION_BENCH_DATASET_NAME, split=split)
+    images = [img.convert("RGB") for img in dataset["image"]]
+    bboxes = list(dataset["bboxes"])
+    line_text = list(dataset["text"])
+    languages = list(dataset["language"])
+
+    print(f"Loaded {len(images)} images. Running OCR...")
+
+    # Normalize each language entry to a list so that single-language and
+    # multi-language rows are handled the same way below.
+    lang_list = [lang if isinstance(lang, list) else [lang] for lang in languages]
+
+    predictions_by_image = run_recognition(images, lang_list, rec_model, rec_processor, bboxes=bboxes)
+
+    image_scores = defaultdict(list)
+    for pred, ref_text, langs in zip(predictions_by_image, line_text, lang_list):
+        if any(is_arabic(code) for code in langs):
+            # Reshape both sides so prediction and reference are compared in the same form.
+            # NOTE: the prediction is updated in place, so the debug images below
+            # use the reshaped prediction but the original (unreshaped) reference.
+            ref_text = [arabic_reshaper.reshape(t) for t in ref_text]
+            pred["text_lines"] = [arabic_reshaper.reshape(t) for t in pred["text_lines"]]
+        image_score = overlap_score(pred["text_lines"], ref_text)
+        # An image tagged with several languages counts toward each of them.
+        for code in langs:
+            image_scores[CODE_TO_LANGUAGE[code]].append(image_score)
+
+    image_avgs = {name: sum(scores) / len(scores) for name, scores in image_scores.items()}
+    print(image_avgs)
+
+    result_path = os.path.join(args.results_dir, "rec_bench")
+    os.makedirs(result_path, exist_ok=True)
+
+    # The file is only written, never read back, so plain "w" is sufficient.
+    with open(os.path.join(result_path, "results.json"), "w") as f:
+        json.dump(image_scores, f)
+
+    if args.debug:
+        for idx, (image, pred, ref_text, bbox, langs) in enumerate(zip(images, predictions_by_image, line_text, bboxes, lang_list)):
+            prefix = f"{'_'.join(langs)}_{idx}"
+            pred_image = draw_text_on_image(bbox, pred["text_lines"], image.size)
+            pred_image.save(os.path.join(result_path, f"{prefix}_pred.png"))
+            ref_image = draw_text_on_image(bbox, ref_text, image.size)
+            ref_image.save(os.path.join(result_path, f"{prefix}_ref.png"))
+            image.save(os.path.join(result_path, f"{prefix}_image.png"))
+
+    print(f"Wrote results to {result_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,20 @@
+import math
+from rapidfuzz import fuzz
+
+
+def overlap_score(pred_lines, reference_lines):
+    """Score predicted text lines against reference lines by fuzzy overlap.
+
+    Each predicted line is paired with the reference line that gives the
+    highest `fuzz.ratio` similarity (called with `score_cutoff=20` and scaled
+    to the 0-1 range).  The per-line scores are then averaged, weighting each
+    line by the square root of the length of its best-matching reference
+    line.  A predicted line with no positive-scoring match keeps a score of 0
+    and a weight of 1.
+
+    Args:
+        pred_lines: Predicted text lines (strings).
+        reference_lines: Reference (ground-truth) text lines (strings).
+
+    Returns:
+        The weighted mean of the per-line scores, or 0.0 when the weights sum
+        to zero (for example when `pred_lines` is empty).
+    """
+    line_scores = []
+    line_weights = []
+    for pred_line in pred_lines:
+        max_score = 0
+        line_weight = 1
+        for ref_line in reference_lines:
+            score = fuzz.ratio(pred_line, ref_line, score_cutoff=20) / 100
+            if score > max_score:
+                max_score = score
+                # Longer reference lines count for more, with diminishing returns.
+                line_weight = math.sqrt(len(ref_line))
+        line_scores.append(max_score)
+        line_weights.append(line_weight)
+
+    total_weight = sum(line_weights)
+    if total_weight == 0:
+        # No predicted lines (or only zero-weight matches): report no overlap
+        # instead of raising ZeroDivisionError.
+        return 0.0
+
+    weighted_total = sum(score * weight for score, weight in zip(line_scores, line_weights))
+    return weighted_total / total_weight
@@ -1,6 +1,6 @@
 import gradio as gr
-from surya.detection import batch_inference
-from surya.model.segformer import load_model, load_processor
+from surya.detection import batch_detection
+from surya.model.detection.segformer import load_model, load_processor
 from surya.postprocessing.heatmap import draw_polys_on_image
 
 model, processor = load_model(), load_processor()
@@ -18,7 +18,7 @@
 """.strip()
 
 def text_detection(img):
-    preds = batch_inference([img], model, processor)[0]
+    preds = batch_detection([img], model, processor)[0]
     img = draw_polys_on_image(preds["polygons"], img)
     return img, preds
 
 
@@ -4,17 +4,18 @@
 from collections import defaultdict
 
 from surya.input.load import load_from_folder, load_from_file
-from surya.model.segformer import load_model, load_processor
-from surya.detection import batch_inference
+from surya.model.detection.segformer import load_model, load_processor
+from surya.detection import batch_detection
 from surya.postprocessing.affinity import draw_lines_on_image
 from surya.postprocessing.heatmap import draw_polys_on_image
 from surya.settings import settings
 import os
+from tqdm import tqdm
 
 
 def main():
     parser = argparse.ArgumentParser(description="Detect bboxes in an input file or folder (PDFs or image).")
-    parser.add_argument("input_path", type=str, help="Path to pdf or image file to detect bboxes in.")
+    parser.add_argument("input_path", type=str, help="Path to pdf or image file or folder to detect bboxes in.")
     parser.add_argument("--results_dir", type=str, help="Path to JSON file with OCR results.", default=os.path.join(settings.RESULT_DIR, "surya"))
     parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
     parser.add_argument("--images", action="store_true", help="Save images of detected bboxes.", default=False)
@@ -31,7 +32,7 @@ def main():
         images, names = load_from_file(args.input_path, args.max)
         folder_name = os.path.basename(args.input_path).split(".")[0]
 
-    predictions = batch_inference(images, model, processor)
+    predictions = batch_detection(images, model, processor)
     result_path = os.path.join(args.results_dir, folder_name)
     os.makedirs(result_path, exist_ok=True)