Skip to content

Commit 87e4278

Browse files
Update accelerator options for EasyOCR
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
1 parent 19ba945 commit 87e4278

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

src/instructlab/sdg/utils/chunkers.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
# Third Party
1010
from datasets import Dataset
11+
from docling.accelerator import decide_device
1112
from docling.chunking import HybridChunker
1213
from docling.datamodel.base_models import InputFormat
1314
from docling.datamodel.document import ConversionResult
@@ -53,14 +54,17 @@ def resolve_ocr_options() -> OcrOptions:
5354
ocr_options = EasyOcrOptions()
5455
# Keep easyocr models on the CPU instead of GPU
5556
ocr_options.use_gpu = False
57+
accelerator_options = AcceleratorOptions(device=decide_device(None))
5658
# triggers torch loading, import lazily
5759
# pylint: disable=import-outside-toplevel
5860
# Third Party
5961
from docling.models.easyocr_model import EasyOcrModel
6062

61-
accelerator_options = AcceleratorOptions()
6263
_ = EasyOcrModel(
63-
True, None, ocr_options, accelerator_options=accelerator_options
64+
enabled=True,
65+
artifacts_path=None,
66+
options=ocr_options,
67+
accelerator_options=accelerator_options,
6468
)
6569
return ocr_options
6670
except ImportError:
@@ -193,7 +197,9 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
193197
chunk_iter = chunker.chunk(
194198
dl_doc=data
195199
) # Use hybrid chunker to chunk the document
196-
chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter]
200+
201+
chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter]
202+
197203
fused_texts = self.fuse_texts(chunks, 200)
198204

199205
num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
@@ -317,11 +323,11 @@ def export_documents(self, converted_docs: Iterable[ConversionResult]):
317323

318324
# Export Deep Search document JSON format:
319325
with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp:
320-
fp.write(json.dumps(doc.legacy_document.export_to_dict()))
326+
fp.write(json.dumps(doc.document.export_to_dict()))
321327

322328
# Export Markdown format:
323329
with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp:
324-
fp.write(doc.legacy_document.export_to_markdown())
330+
fp.write(doc.document.export_to_markdown())
325331
else:
326332
logger.info(f"Document {doc.input.file} failed to convert.")
327333
failure_count += 1

0 commit comments

Comments (0)