-
Notifications
You must be signed in to change notification settings - Fork 321
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Some changes for table parsing * Allowed local embedding models * Added examples * Added summary controller * update: change prompt template based on retriever example (#141) * Updated table parser * fix: retriever selection (#140) * update: change prompt template based on retriever example --------- Co-authored-by: Prathamesh Saraf <prathamesh@mbp.local> Co-authored-by: SayanDeveloper <samantasayan23@gmail.com> * update: re-ordering controllers, collections (#143) * Updated table parser * fix: retriever selection (#140) * update: change prompt template based on retriever example (#142) * update: re-ordering controllers, collections --------- Co-authored-by: Prathamesh Saraf <prathamesh@mbp.local> Co-authored-by: SayanDeveloper <samantasayan23@gmail.com> * Added summary * Updated docker file * Added tf parser * Updated local metadata and added new controller * Removed tfy-api key from modules * Fixed linting * Fixing parser * Integrated tf-parser to indexing --------- Co-authored-by: Prathamesh Saraf <prathamesh@mbp.local> Co-authored-by: sayan-truefoundry <136362719+sayan-truefoundry@users.noreply.github.com> Co-authored-by: SayanDeveloper <samantasayan23@gmail.com>
- Loading branch information
1 parent
a473c61
commit 79d8372
Showing
13 changed files
with
1,210 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
import os | ||
from typing import Optional | ||
|
||
import fitz | ||
import requests | ||
from langchain.docstore.document import Document | ||
|
||
from backend.logger import logger | ||
from backend.modules.parsers.parser import BaseParser | ||
from backend.settings import settings | ||
|
||
|
||
class TfParser(BaseParser):
    """
    Multi-modal parser for deep extraction of PDF documents.

    Every page of the PDF is written to its own single-page file and posted to
    the TfParser service (``/tf-parse-pdf``). The text of all parsed pages is
    then joined and posted once more (``/tf-get-response``); the service's
    answer is appended as one additional document.

    Requires a running instance of the TfParser service that has access to
    the TFLLM Gateway. The service base URL is read from ``settings.TF_PARSER``.
    """

    supported_file_extensions = [".pdf"]

    def __init__(self, max_chunk_size: int = 1000, *args, **kwargs):
        """
        Initializes the TfParser object.

        Args:
            max_chunk_size: Stored on the instance. It is not read by any
                method of this class.
            *args, **kwargs: Accepted for interface compatibility and ignored.
        """
        self.max_chunk_size = max_chunk_size
        self.tf_service_url = settings.TF_PARSER

    async def _post(self, endpoint: str, **request_kwargs):
        """
        POSTs to ``endpoint`` on the TfParser service and validates the reply.

        Returns:
            The ``requests.Response`` when the body is JSON and is not an
            ``{"error": ...}`` object, otherwise ``None``.
        """
        # Local import: the module's import block is not touched by this change.
        import asyncio

        url = self.tf_service_url.rstrip("/") + endpoint
        # requests is synchronous; run it in the default executor so the
        # event loop is not blocked while the service processes the request.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None, lambda: requests.post(url, **request_kwargs)
        )

        # The error marker lives in the JSON body. Testing membership on the
        # Response object itself iterates raw byte chunks and never matches.
        try:
            body = response.json()
        except ValueError:
            logger.error(
                f"Error: non-JSON response (HTTP {response.status_code}) from {url}"
            )
            return None
        if isinstance(body, dict) and "error" in body:
            logger.error(f"Error: {body['error']}")
            return None
        return response

    async def _send_file_request(self, payload: dict, endpoint: str):
        """
        Sends a multipart file POST request to the TfParser service.

        Args:
            payload: Mapping passed to ``requests.post(files=...)``.
            endpoint: Path appended to the service base URL.

        Returns:
            The response, or ``None`` if the service reported an error.
        """
        return await self._post(endpoint, files=payload)

    async def _send_text_request(self, payload: dict, endpoint: str):
        """
        Sends a JSON POST request to the TfParser service.

        Args:
            payload: Object passed to ``requests.post(json=...)``.
            endpoint: Path appended to the service base URL.

        Returns:
            The response, or ``None`` if the service reported an error.
        """
        return await self._post(endpoint, json=payload)

    async def _parse_page(self, doc, page_index: int, head: str, tail: str):
        """
        Parses one page of ``doc`` through the service and returns its documents.

        The page is copied into a temporary single-page PDF next to the source
        file; that file is always removed again, whether or not parsing works.
        """
        page_number = page_index + 1
        temp_file = os.path.join(head, f"{tail}-{page_number}.pdf")
        try:
            single_page = fitz.open()
            try:
                # copy over the current page only
                single_page.insert_pdf(doc, from_page=page_index, to_page=page_index)
                single_page.save(temp_file)
            finally:
                single_page.close()

            with open(temp_file, "rb") as f:
                response = await self._send_file_request(
                    payload={"file": f},
                    endpoint="/tf-parse-pdf",
                )
        finally:
            # The file may not exist if copying/saving failed before it was written.
            if os.path.exists(temp_file):
                logger.debug("Removing temp file...")
                os.remove(temp_file)

        if response is None:
            return []
        results = response.json()
        if not isinstance(results, list):
            logger.error(
                f"Error in Page: unexpected response payload of type {type(results).__name__}"
            )
            return []

        page_docs = []
        for res in results:
            if not isinstance(res, dict):
                continue
            page_content = (res.get("page_content") or "").strip()
            if not page_content:
                continue
            # Copy so the response object is not mutated, and tolerate a null value.
            chunk_metadata = dict(res.get("metadata") or {})
            chunk_metadata["page_number"] = page_number
            chunk_metadata["source"] = tail
            page_docs.append(
                Document(page_content=page_content, metadata=chunk_metadata)
            )
            logger.debug(
                f"Page Content: {page_content}, \nmetadata-pg-no: {page_number}, "
                f"metadata-type: {chunk_metadata.get('type')}"
            )
        return page_docs

    async def get_chunks(
        self, filepath: str, metadata: Optional[dict] = None, *args, **kwargs
    ):
        """
        Asynchronously extracts text from a PDF file and returns it in chunks.

        Args:
            filepath: Path to the PDF file.
            metadata: Accepted for interface compatibility. It is not merged
                into the returned documents; their metadata comes from the
                service plus ``page_number`` and ``source``.

        Returns:
            A list of ``Document`` objects: the per-page results followed by,
            when any page text was extracted, one document built from the
            ``/tf-get-response`` reply. Failures are logged and whatever was
            collected so far is returned; an unsupported extension yields ``[]``.
        """
        if not filepath.lower().endswith(".pdf"):
            logger.error("Invalid file extension. TfParser only supports PDF files.")
            return []

        page_texts = []
        final_texts = []
        # directory and file name of the source document
        head, tail = os.path.split(filepath)

        try:
            # Open the PDF file using PyMuPDF (fitz)
            doc = fitz.open(filepath)
            try:
                for page in doc:
                    logger.info(f"Processing page {page.number + 1}...")
                    try:
                        page_docs = await self._parse_page(
                            doc, page.number, head, tail
                        )
                    except Exception as e:
                        # Best effort: one bad page must not abort the document.
                        logger.exception(f"Exception in Page: {e}")
                        continue
                    final_texts.extend(page_docs)
                    page_texts.extend(d.page_content for d in page_docs)
            finally:
                doc.close()

            document_text = " ".join(page_texts)
            if document_text:
                logger.info("Processing combined doc...")
                response = await self._send_text_request(
                    payload={"text": document_text, "file_name": tail},
                    endpoint="/tf-get-response",
                )
                body = response.json() if response is not None else None
                if isinstance(body, dict):
                    page_content = (body.get("page_content") or "").strip()
                    if page_content:
                        final_texts.append(
                            Document(
                                page_content=page_content,
                                metadata=body.get("metadata") or {},
                            )
                        )
                        logger.debug(f"Page Content: {page_content}")
            return final_texts
        except Exception as e:
            logger.exception(f"Ultimate Exception: {e}")
            return final_texts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
from backend.modules.query_controllers.example.controller import ExampleQueryController | ||
from backend.modules.query_controllers.query_controller import register_query_controller | ||
from backend.modules.query_controllers.summary.controller import SummaryQueryController | ||
|
||
# Register every bundled query controller under its public name.
# Order is preserved: "default" first, then "summary".
for _controller_name, _controller_cls in (
    ("default", ExampleQueryController),
    ("summary", SummaryQueryController),
):
    register_query_controller(_controller_name, _controller_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.