Skip to content

Commit c074035

Browse files
NastyBoget, oksidgy, sunveil
authored
new version 2.3.2 (#513)
Co-authored-by: Oksana Belyaeva <belyaeva@ispras.ru> Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru>
1 parent 6a60e97 commit c074035

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+721
-1024
lines changed

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.3.1
1+
2.3.2

dedoc/api/api_args.py

-3
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,6 @@ class QueryParameters:
2222
# tables handling
2323
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
2424
table_type: str = Form("", description="Pipeline mode for table recognition")
25-
orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
26-
orient_cell_angle: str = Form("90", enum=["90", "270"],
27-
description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
2825

2926
# pdf handling
3027
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],

dedoc/api/api_utils.py

+10-14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Dict, Iterator, List, Optional, Set
22

3+
from dedoc.api.schema import LineMetadata, ParsedDocument, Table, TreeNode
34
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
45
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
56
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
@@ -10,10 +11,6 @@
1011
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
1112
from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation
1213
from dedoc.data_structures.hierarchy_level import HierarchyLevel
13-
from dedoc.data_structures.line_metadata import LineMetadata
14-
from dedoc.data_structures.parsed_document import ParsedDocument
15-
from dedoc.data_structures.table import Table
16-
from dedoc.data_structures.tree_node import TreeNode
1714
from dedoc.extensions import converted_mimes, recognized_mimes
1815

1916

@@ -39,7 +36,7 @@ def _node2tree(paragraph: TreeNode, depth: int, depths: Set[int] = None) -> str:
3936
space = "".join(space)
4037
node_result = []
4138

42-
node_result.append(f" {space} {paragraph.metadata.hierarchy_level.line_type}&nbsp{paragraph.node_id} ")
39+
node_result.append(f" {space} {paragraph.metadata.paragraph_type}&nbsp{paragraph.node_id} ")
4340
for text in __prettify_text(paragraph.text):
4441
space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol]
4542
space = "".join(space)
@@ -98,7 +95,7 @@ def json2tree(paragraph: TreeNode) -> str:
9895
depths = {d for d in depths if d <= depth}
9996
space = [space_symbol] * 4 * (depth - 1) + 4 * ["-"]
10097
space = __add_vertical_line(depths, space)
101-
node_result.append(f"<p> <tt> <em> {space} {node.metadata.hierarchy_level.line_type}&nbsp{node.node_id} </em> </tt> </p>")
98+
node_result.append(f"<p> <tt> <em> {space} {node.metadata.paragraph_type}&nbsp{node.node_id} </em> </tt> </p>")
10299
for text in __prettify_text(node.text):
103100
space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol]
104101
space = __add_vertical_line(depths, space)
@@ -136,14 +133,14 @@ def json2html(text: str,
136133

137134
ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs)
138135

139-
if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]:
136+
if paragraph.metadata.paragraph_type in [HierarchyLevel.header, HierarchyLevel.root]:
140137
ptext = f"<strong>{ptext.strip()}</strong>"
141-
elif paragraph.metadata.hierarchy_level.line_type == HierarchyLevel.list_item:
138+
elif paragraph.metadata.paragraph_type == HierarchyLevel.list_item:
142139
ptext = f"<em>{ptext.strip()}</em>"
143140
else:
144141
ptext = ptext.strip()
145142

146-
ptext = f'<p> {"&nbsp;" * tabs} {ptext} <sub> id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type} </sub></p>'
143+
ptext = f'<p> {"&nbsp;" * tabs} {ptext} <sub> id = {paragraph.node_id} ; type = {paragraph.metadata.paragraph_type} </sub></p>'
147144
if hasattr(paragraph.metadata, "uid"):
148145
ptext = f'<div id="{paragraph.metadata.uid}">{ptext}</div>'
149146
text += ptext
@@ -259,11 +256,10 @@ def table2html(table: Table, table2id: Dict[str, int]) -> str:
259256
text += ' style="display: none" '
260257
cell_node = TreeNode(
261258
node_id="0",
262-
text=cell.get_text(),
263-
annotations=cell.get_annotations(),
264-
metadata=LineMetadata(page_id=table.metadata.page_id, line_id=0),
265-
subparagraphs=[],
266-
parent=None
259+
text="\n".join([line.text for line in cell.lines]),
260+
annotations=cell.lines[0].annotations if cell.lines else [],
261+
metadata=LineMetadata(page_id=0, line_id=0, paragraph_type=HierarchyLevel.raw_text),
262+
subparagraphs=[]
267263
)
268264
text += f' colspan="{cell.colspan}" rowspan="{cell.rowspan}">{__annotations2html(cell_node, {}, {})}</td>\n'
269265

dedoc/api/cancellation.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import logging
2+
from contextlib import asynccontextmanager
3+
4+
from anyio import create_task_group
5+
from fastapi import Request
6+
7+
8+
@asynccontextmanager
async def cancel_on_disconnect(request: Request, logger: logging.Logger) -> None:
    """
    Run the wrapped async code until it finishes or the client drops the connection.

    A background task reads messages from the given request; once an "http.disconnect" message arrives,
    a warning is logged and the cancel scope of the surrounding task group is cancelled,
    which interrupts the code running inside the ``async with`` body.
    When the body finishes on its own, the scope is cancelled as well in order to stop the background task.

    Source: https://github.com/dorinclisu/runner-with-api
    See discussion: https://github.com/fastapi/fastapi/discussions/8805

    :param request: request whose incoming messages are monitored for the disconnect event
    :param logger: logger used for reporting the disconnect
    """
    async with create_task_group() as tg:

        async def watch_disconnect() -> None:
            # Consume incoming messages until the disconnect notification shows up
            message = await request.receive()
            while message["type"] != "http.disconnect":
                message = await request.receive()

            client = f"{request.client.host}:{request.client.port}" if request.client else "-:-"
            logger.warning(f"{client} - `{request.method} {request.url.path}` 499 DISCONNECTED")

            # Interrupt everything running under this task group, including the caller's body
            tg.cancel_scope.cancel()

        tg.start_soon(watch_disconnect)

        try:
            yield
        finally:
            # The body is done (or failed): the watcher task is not needed anymore
            tg.cancel_scope.cancel()

dedoc/api/dedoc_api.py

+15-25
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
import base64
21
import dataclasses
32
import importlib
43
import json
54
import os
65
import tempfile
7-
import traceback
86
from typing import Optional
97

108
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
@@ -15,24 +13,23 @@
1513
import dedoc.version
1614
from dedoc.api.api_args import QueryParameters
1715
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
16+
from dedoc.api.process_handler import ProcessHandler
1817
from dedoc.api.schema.parsed_document import ParsedDocument
1918
from dedoc.common.exceptions.dedoc_error import DedocError
2019
from dedoc.common.exceptions.missing_file_error import MissingFileError
2120
from dedoc.config import get_config
22-
from dedoc.dedoc_manager import DedocManager
2321
from dedoc.utils.utils import save_upload_file
2422

2523
config = get_config()
24+
logger = config["logger"]
2625
PORT = config["api_port"]
2726
static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web")
2827
static_files_dirs = config.get("static_files_dirs")
2928

3029
app = FastAPI()
3130
app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web")
32-
3331
module_api_args = importlib.import_module(config["import_path_init_api_args"])
34-
logger = config["logger"]
35-
manager = DedocManager(config=config)
32+
process_handler = ProcessHandler(logger=logger)
3633

3734

3835
@app.get("/")
@@ -62,27 +59,20 @@ def _get_static_file_path(request: Request) -> str:
6259
return os.path.abspath(os.path.join(directory, file))
6360

6461

65-
def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
66-
for attachment in document_tree.attachments:
67-
with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
68-
attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
69-
70-
7162
@app.post("/upload", response_model=ParsedDocument)
72-
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
63+
async def upload(request: Request, file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
7364
parameters = dataclasses.asdict(query_params)
7465
if not file or file.filename == "":
7566
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__)
7667

77-
return_format = str(parameters.get("return_format", "json")).lower()
78-
7968
with tempfile.TemporaryDirectory() as tmpdir:
8069
file_path = save_upload_file(file, tmpdir)
81-
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
70+
document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir)
8271

83-
if return_format == "html":
84-
__add_base64_info_to_attachments(document_tree, tmpdir)
72+
if document_tree is None:
73+
return JSONResponse(status_code=499, content={})
8574

75+
return_format = str(parameters.get("return_format", "json")).lower()
8676
if return_format == "html":
8777
html_content = json2html(
8878
text="",
@@ -102,24 +92,25 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
10292
return HTMLResponse(content=html_content)
10393

10494
if return_format == "ujson":
105-
return UJSONResponse(content=document_tree.to_api_schema().model_dump())
95+
return UJSONResponse(content=document_tree.model_dump())
10696

10797
if return_format == "collapsed_tree":
10898
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
10999
return HTMLResponse(content=html_content)
110100

111101
if return_format == "pretty_json":
112-
return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
102+
return PlainTextResponse(content=json.dumps(document_tree.model_dump(), ensure_ascii=False, indent=2))
113103

114104
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
115-
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
105+
return ORJSONResponse(content=document_tree.model_dump())
116106

117107

118108
@app.get("/upload_example")
119-
async def upload_example(file_name: str, return_format: Optional[str] = None) -> Response:
109+
async def upload_example(request: Request, file_name: str, return_format: Optional[str] = None) -> Response:
120110
file_path = os.path.join(static_path, "examples", file_name)
121111
parameters = {} if return_format is None else {"return_format": return_format}
122-
document_tree = manager.parse(file_path, parameters=parameters)
112+
with tempfile.TemporaryDirectory() as tmpdir:
113+
document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir)
123114

124115
if return_format == "html":
125116
html_page = json2html(
@@ -130,12 +121,11 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) ->
130121
tabs=0
131122
)
132123
return HTMLResponse(content=html_page)
133-
return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200)
124+
return ORJSONResponse(content=document_tree.model_dump(), status_code=200)
134125

135126

136127
@app.exception_handler(DedocError)
137128
async def exception_handler(request: Request, exc: DedocError) -> Response:
138-
logger.error(f"Exception {exc}\n{traceback.format_exc()}")
139129
result = {"message": exc.msg}
140130
if exc.filename:
141131
result["file_name"] = exc.filename

dedoc/api/process_handler.py

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import asyncio
import base64
import logging
import os
import pickle
import signal
import threading
import traceback
from multiprocessing import Process, Queue
from queue import Empty
from typing import Optional
from urllib.request import Request

from anyio import get_cancelled_exc_class

from dedoc.api.cancellation import cancel_on_disconnect
from dedoc.api.schema import ParsedDocument
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.config import get_config
from dedoc.dedoc_manager import DedocManager
19+
20+
21+
class ProcessHandler:
    """
    Class for file parsing by DedocManager with support for client disconnection.
    If client disconnects during file parsing, the process of parsing is fully terminated and API is available to receive new connections.

    Handler uses the following algorithm:
    1. Master process is used for checking current connection (client disconnect)
    2. Child process is working on the background and waiting for the input file in the input_queue
    3. Master process calls the child process for parsing and transfers data through the input_queue
    4. Child process is parsing file using DedocManager
    5. The result of parsing is transferred to the master process through the output_queue
    6. If client disconnects, the child process is terminated. The new child process with queues will start with the new request

    NOTE(review): one pair of queues is shared by all calls of `handle`, so the handler looks designed for
    one request at a time - confirm that requests cannot overlap, otherwise results may be mixed up.
    """

    # How often (in seconds) the thread waiting for the result checks whether it should stop waiting
    _RESULT_POLL_INTERVAL = 0.5

    def __init__(self, logger: logging.Logger) -> None:
        """
        :param logger: logger used by the master process for reporting the state of the handler
        """
        self.logger = logger
        self.__start_process()

    def __start_process(self) -> None:
        """
        Create a fresh pair of queues and start a new child process that serves them.
        """
        self.input_queue = Queue()
        self.output_queue = Queue()
        self.process = Process(target=self.__parse_file, args=[self.input_queue, self.output_queue])
        self.process.start()

    async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
        """
        Handle request in a separate process.
        Checks for client disconnection and terminate the child process if client disconnected.

        NOTE(review): `Request` is imported from `urllib.request` in this module, but the object is forwarded to
        `cancel_on_disconnect`, which works with the FastAPI request - the annotation looks wrong, confirm and fix the import.

        :param request: request of the client, it is monitored for the disconnect event
        :param parameters: parsing parameters, they are passed to the child process together with the paths
        :param file_path: path to the file that should be parsed
        :param tmpdir: directory that the child process uses as `attachments_dir`
        :return: parsed document or None if the client disconnected before the result was obtained
        :raises DedocError: if the child process reported an error instead of the parsed document
        """
        # The worker is absent after a client disconnect; it also may have died on its own
        if self.process is None or not self.process.is_alive():
            self.logger.info("Initialization of a new parsing process")
            self.__start_process()

        self.logger.info("Putting file to the input queue")
        self.input_queue.put(pickle.dumps((parameters, file_path, tmpdir)), block=True)

        loop = asyncio.get_running_loop()
        stop_waiting = threading.Event()
        async with cancel_on_disconnect(request, self.logger):
            # Waiting on the queue is blocking, so it is done in the executor thread
            future = loop.run_in_executor(None, self.__wait_for_result, self.output_queue, stop_waiting)
            try:
                result = await future
            except get_cancelled_exc_class():
                self.logger.warning("Terminating the parsing process")
                # Cancelling the future doesn't stop the executor thread, so it is released explicitly
                stop_waiting.set()
                if self.process is not None:
                    self.process.terminate()
                self.process = None
                future.cancel()
                return None

        # Data in the queue is produced by our own child process (see __parse_file), so unpickling is trusted here
        result = pickle.loads(result)
        if isinstance(result, ParsedDocument):
            self.logger.info("Got the result from the output queue")
            return result

        # Anything except the parsed document is a dictionary with the description of an error
        raise DedocError.from_dict(result)

    def __wait_for_result(self, output_queue: Queue, stop_event: threading.Event) -> Optional[bytes]:
        """
        Block until the child process puts the result to the output queue or until waiting is not needed anymore.

        :param output_queue: queue where the child process puts the pickled result
        :param stop_event: event that is set when nobody is waiting for the result anymore
        :return: pickled result from the queue or None if waiting was stopped
        """
        while not stop_event.is_set():
            try:
                return output_queue.get(block=True, timeout=self._RESULT_POLL_INTERVAL)
            except Empty:
                continue
        return None

    def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
        """
        Function for file parsing in a separate (child) process.
        It's a background process, i.e. it is waiting for a task in the input queue.
        The result of parsing is returned in the output queue.

        Operations with `signal` are used for saving master process while killing child process.
        See the issue for more details: https://github.com/fastapi/fastapi/issues/1487

        :param input_queue: queue with pickled tuples (parameters, file_path, tmp_dir)
        :param output_queue: queue for the pickled parsed document or the pickled dictionary with an error description
        """
        # Restore default signal handling in the child process
        signal.set_wakeup_fd(-1)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        signal.signal(signal.SIGINT, signal.SIG_DFL)

        manager = DedocManager(config=get_config())
        manager.logger.info("Parsing process is waiting for the task in the input queue")

        while True:
            # Reset the path: it is used in error reporting even if getting of the task fails
            file_path = None
            try:
                parameters, file_path, tmp_dir = pickle.loads(input_queue.get(block=True))
                manager.logger.info("Parsing process got task from the input queue")
                return_format = str(parameters.get("return_format", "json")).lower()
                document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmp_dir})

                if return_format == "html":
                    self.__add_base64_info_to_attachments(document_tree, tmp_dir)

                output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True)
                manager.logger.info("Parsing process put task to the output queue")
            except DedocError as e:
                tb = traceback.format_exc()
                manager.logger.error(f"Exception {e}: {e.msg_api}\n{tb}")
                # Attributes of the error are sent, the master process rebuilds the error with DedocError.from_dict
                output_queue.put(pickle.dumps(e.__dict__), block=True)
            except Exception as e:
                # The loop must survive any error, so unexpected errors are reported to the master process as well
                exc_message = f"Exception {e}\n{traceback.format_exc()}"
                filename = "" if file_path is None else os.path.basename(file_path)
                manager.logger.error(exc_message)
                output_queue.put(pickle.dumps({"msg": exc_message, "filename": filename}), block=True)

    def __add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
        """
        Read each attachment of the document from the directory and add its base64-encoded content to the attachment's metadata.

        :param document_tree: document whose attachments should be supplemented with the "base64" attribute
        :param attachments_dir: directory with the attachment files, they are found by `metadata.temporary_file_name`
        """
        for attachment in document_tree.attachments:
            with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
                attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))

0 commit comments

Comments
 (0)