|
8 | 8 |
|
9 | 9 | # Third Party
|
10 | 10 | from datasets import Dataset
|
| 11 | +from docling.accelerator import decide_device |
11 | 12 | from docling.chunking import HybridChunker
|
12 | 13 | from docling.datamodel.base_models import InputFormat
|
13 | 14 | from docling.datamodel.document import ConversionResult
|
@@ -53,14 +54,17 @@ def resolve_ocr_options() -> OcrOptions:
|
53 | 54 | ocr_options = EasyOcrOptions()
|
54 | 55 | # Keep easyocr models on the CPU instead of GPU
|
55 | 56 | ocr_options.use_gpu = False
|
| 57 | + accelerator_options = AcceleratorOptions(device=decide_device(None)) |
56 | 58 | # triggers torch loading, import lazily
|
57 | 59 | # pylint: disable=import-outside-toplevel
|
58 | 60 | # Third Party
|
59 | 61 | from docling.models.easyocr_model import EasyOcrModel
|
60 | 62 |
|
61 |
| - accelerator_options = AcceleratorOptions() |
62 | 63 | _ = EasyOcrModel(
|
63 |
| - True, None, ocr_options, accelerator_options=accelerator_options |
| 64 | + enabled=True, |
| 65 | + artifacts_path=None, |
| 66 | + options=ocr_options, |
| 67 | + accelerator_options=accelerator_options, |
64 | 68 | )
|
65 | 69 | return ocr_options
|
66 | 70 | except ImportError:
|
@@ -193,7 +197,9 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
|
193 | 197 | chunk_iter = chunker.chunk(
|
194 | 198 | dl_doc=data
|
195 | 199 | ) # Use hybrid chunker to chunk the document
|
196 |
| - chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter] |
| 200 | + |
| 201 | + chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter] |
| 202 | + |
197 | 203 | fused_texts = self.fuse_texts(chunks, 200)
|
198 | 204 |
|
199 | 205 | num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
|
@@ -317,11 +323,11 @@ def export_documents(self, converted_docs: Iterable[ConversionResult]):
|
317 | 323 |
|
318 | 324 | # Export Deep Search document JSON format:
|
319 | 325 | with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp:
|
320 |
| - fp.write(json.dumps(doc.legacy_document.export_to_dict())) |
| 326 | + fp.write(json.dumps(doc.document.export_to_dict())) |
321 | 327 |
|
322 | 328 | # Export Markdown format:
|
323 | 329 | with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp:
|
324 |
| - fp.write(doc.legacy_document.export_to_markdown()) |
| 330 | + fp.write(doc.document.export_to_markdown()) |
325 | 331 | else:
|
326 | 332 | logger.info(f"Document {doc.input.file} failed to convert.")
|
327 | 333 | failure_count += 1
|
|
0 commit comments