Skip to content

Commit

Permalink
Tfparser (#148)
Browse files Browse the repository at this point in the history
* Some changes for table parsing

* Allowed local embedding models

* Added examples

* Added summary controller

* update: change prompt template based on retriever example (#141)

* Updated table parser

* fix: retriever selection (#140)

* update: change prompt template based on retriever example

---------

Co-authored-by: Prathamesh Saraf <prathamesh@mbp.local>
Co-authored-by: SayanDeveloper <samantasayan23@gmail.com>

* update: re-ordering controllers, collections (#143)

* Updated table parser

* fix: retriever selection (#140)

* update: change prompt template based on retriever example (#142)

* update: re-ordering controllers, collections

---------

Co-authored-by: Prathamesh Saraf <prathamesh@mbp.local>
Co-authored-by: SayanDeveloper <samantasayan23@gmail.com>

* Added summary

* Updated docker file

* Added tf parser

* Updated local metadata and added new controller

* Removed tfy-api key from modules

* Fixed linting

* Fixing parser

* Integrated tf-parser to indexing

---------

Co-authored-by: Prathamesh Saraf <prathamesh@mbp.local>
Co-authored-by: sayan-truefoundry <136362719+sayan-truefoundry@users.noreply.github.com>
Co-authored-by: SayanDeveloper <samantasayan23@gmail.com>
  • Loading branch information
4 people authored Apr 28, 2024
1 parent a473c61 commit 79d8372
Show file tree
Hide file tree
Showing 13 changed files with 1,210 additions and 43 deletions.
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM --platform=linux/amd64 python:3.10

RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 poppler-utils -y
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 poppler-utils qpdf -y

# Set environment variables
ENV PATH=/virtualenvs/venv/bin:$PATH
Expand Down
2 changes: 2 additions & 0 deletions backend/modules/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from backend.modules.parsers.pdfparser_fast import PdfParserUsingPyMuPDF
from backend.modules.parsers.tablepdfparser import PdfTableParser
from backend.modules.parsers.textparser import TextParser
from backend.modules.parsers.tfparser import TfParser

# Register every available parser class under a string key (first argument).
# NOTE(review): presumably this key is what callers use to select a parser by
# name -- confirm against the definition of register_parser.
register_parser("MarkdownParser", MarkdownParser)
register_parser("PdfParserFast", PdfParserUsingPyMuPDF)
register_parser("TextParser", TextParser)
register_parser("PdfTableParser", PdfTableParser)
register_parser("TfParser", TfParser)
12 changes: 12 additions & 0 deletions backend/modules/parsers/tablepdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,18 @@ async def get_chunks(self, filepath, metadata, *args, **kwargs):
)
]
table_docs.extend(tab_doc)
if table_data:
tab_doc = [
Document(
page_content=table_data,
metadata={
"page_num": page.page_number,
"type": "table",
"table_num": ix,
},
)
]
table_docs.extend(tab_doc)

text = page.text

Expand Down
158 changes: 158 additions & 0 deletions backend/modules/parsers/tfparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import asyncio
import os
from typing import Optional

import fitz
import requests
from langchain.docstore.document import Document

from backend.logger import logger
from backend.modules.parsers.parser import BaseParser
from backend.settings import settings


class TfParser(BaseParser):
    """
    TfParser is a multi-modal parser class for deep extraction of pdf documents.
    Requires a running instance of the TfParser service that has access to TFLLM Gateway

    The PDF is split into single-page PDFs which are posted one at a time to
    the service's ``/tf-parse-pdf`` endpoint. The text of every chunk returned
    for those pages is then joined and posted to ``/tf-get-response`` to obtain
    one additional chunk derived from the combined text.
    """

    supported_file_extensions = [".pdf"]

    def __init__(self, max_chunk_size: int = 1000, *args, **kwargs):
        """
        Initializes the TfParser object.

        Args:
            max_chunk_size: Stored on the instance; not consulted by this class
                itself (chunking is done by the remote service).
        """
        self.max_chunk_size = max_chunk_size
        self.tf_service_url = settings.TF_PARSER

    @staticmethod
    def _decode(response):
        """Return the JSON body of ``response``, or None if absent/not JSON."""
        if response is None:
            return None
        try:
            return response.json()
        except ValueError:
            # requests raises a ValueError subclass when the body is not JSON.
            return None

    async def _post(self, endpoint: str, **request_kwargs):
        """
        POST to ``endpoint`` on the TfParser service without blocking the loop.

        Returns:
            The ``requests.Response`` on success, or None when the request
            could not be sent or the service answered with an ``error`` payload.
        """
        url = self.tf_service_url.rstrip("/") + endpoint
        try:
            # requests is synchronous; run it in a worker thread so other
            # coroutines keep running while the service is working.
            # NOTE(review): no timeout is set because the expected latency of
            # the service is not visible here -- consider adding one.
            response = await asyncio.to_thread(requests.post, url, **request_kwargs)
        except requests.RequestException as exc:
            logger.error("TfParser request to %s failed: %s", endpoint, exc)
            return None

        body = self._decode(response)
        # The error marker lives in the JSON body, not on the Response object.
        if isinstance(body, dict) and "error" in body:
            logger.error("Error: %s", body["error"])
            return None
        return response

    async def _send_file_request(self, payload: dict, endpoint: str):
        """
        Sends a multipart (file upload) POST request to the TfParser service.

        Returns:
            The response, or None if the request failed or reported an error.
        """
        return await self._post(endpoint, files=payload)

    async def _send_text_request(self, payload: dict, endpoint: str):
        """
        Sends a JSON POST request to the TfParser service.

        Returns:
            The response, or None if the request failed or reported an error.
        """
        return await self._post(endpoint, json=payload)

    async def _parse_page(self, doc, page_index: int, head: str, tail: str):
        """
        Send one page of ``doc`` to the service and return its Documents.

        The page is written next to the source file as
        ``<tail>-<page_number>.pdf`` and is always removed afterwards.
        """
        page_number = page_index + 1
        temp_file = os.path.join(head, f"{tail}-{page_number}.pdf")
        try:
            single_page = fitz.open()
            try:
                # copy over current page
                single_page.insert_pdf(doc, from_page=page_index, to_page=page_index)
                single_page.save(temp_file)
            finally:
                single_page.close()

            # Keep the handle open until the upload has completed.
            with open(temp_file, "rb") as f:
                response = await self._send_file_request(
                    payload={"file": f},
                    endpoint="/tf-parse-pdf",
                )
        finally:
            # Clean up whether or not saving/uploading succeeded.
            if os.path.exists(temp_file):
                os.remove(temp_file)

        body = self._decode(response)
        if not isinstance(body, list):
            if body is not None:
                logger.error(
                    "Unexpected response for page %s: %r", page_number, body
                )
            return []

        documents = []
        for res in body:
            if not isinstance(res, dict):
                continue
            page_content = (res.get("page_content") or "").strip()
            if not page_content:
                continue
            # Copy so the decoded response is not mutated.
            chunk_metadata = dict(res.get("metadata") or {})
            chunk_metadata["page_number"] = page_number
            chunk_metadata["source"] = tail
            documents.append(
                Document(page_content=page_content, metadata=chunk_metadata)
            )
            logger.debug(
                "Page Content: %s, metadata-pg-no: %s, metadata-type: %s",
                page_content,
                page_number,
                chunk_metadata.get("type"),
            )
        return documents

    async def get_chunks(
        self, filepath: str, metadata: Optional[dict] = None, *args, **kwargs
    ):
        """
        Asynchronously extracts text from a PDF file and returns it in chunks.

        Args:
            filepath: Path to the PDF file.
            metadata: Accepted for interface compatibility; not used here.

        Returns:
            A list of ``Document`` objects: the chunks returned for each page,
            followed by one chunk for the combined text when the service
            returns it. A failure on one page is logged and that page skipped;
            any other failure is logged and the chunks collected so far are
            returned.
        """
        if not filepath.lower().endswith(".pdf"):
            logger.error("Invalid file extension. TfParser only supports PDF files.")
            return []

        page_texts = []
        final_texts = []
        # get file path & name
        head, tail = os.path.split(filepath)

        try:
            # Open the PDF file using PyMuPDF (fitz)
            doc = fitz.open(filepath)
            try:
                for page in doc:
                    page_number = page.number + 1
                    logger.info("Processing page %s...", page_number)
                    try:
                        page_docs = await self._parse_page(
                            doc, page.number, head, tail
                        )
                    except Exception:
                        # Best effort: one bad page must not abort the document.
                        logger.exception("Exception in Page %s", page_number)
                        continue
                    final_texts.extend(page_docs)
                    page_texts.extend(d.page_content for d in page_docs)
            finally:
                doc.close()

            document_text = " ".join(page_texts)
            if document_text:
                logger.info("Processing combined doc...")
                response = await self._send_text_request(
                    payload={"text": document_text, "file_name": tail},
                    endpoint="/tf-get-response",
                )
                body = self._decode(response)
                if isinstance(body, dict) and "error" not in body:
                    page_content = (body.get("page_content") or "").strip()
                    if page_content:
                        final_texts.append(
                            Document(
                                page_content=page_content,
                                metadata=dict(body.get("metadata") or {}),
                            )
                        )
                        logger.debug("Page Content: %s", page_content)
            return final_texts
        except Exception:
            logger.exception("Ultimate Exception while parsing %s", tail)
            return final_texts
2 changes: 2 additions & 0 deletions backend/modules/query_controllers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from backend.modules.query_controllers.example.controller import ExampleQueryController
from backend.modules.query_controllers.query_controller import register_query_controller
from backend.modules.query_controllers.summary.controller import SummaryQueryController

# Register each query controller class under a string key (first argument).
# NOTE(review): presumably the key identifies the controller to the rest of
# the backend -- confirm against register_query_controller.
register_query_controller("default", ExampleQueryController)
register_query_controller("summary", SummaryQueryController)
26 changes: 14 additions & 12 deletions backend/modules/query_controllers/example/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from backend.modules.query_controllers.example.payload import (
QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_MMR_PAYLOAD,
QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_SIMILARITY_PAYLOAD,
QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_SIMILARITY_SCORE_PAYLOAD,
QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_PAYLOAD,
QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_SEARCH_TYPE_MMR_PAYLOAD,
QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_SEARCH_TYPE_SIMILARITY_WITH_SCORE_PAYLOAD,
Expand All @@ -40,24 +41,25 @@

EXAMPLES = {
"vector-store-similarity": QUERY_WITH_VECTOR_STORE_RETRIEVER_PAYLOAD,
"multi-query-similarity-threshold": QUERY_WITH_MULTI_QUERY_RETRIEVER_SIMILARITY_SCORE_PAYLOAD,
"vector-store-similarity-threshold": QUERY_WITH_VECTOR_STORE_RETRIEVER_SIMILARITY_SCORE_PAYLOAD,
"contexual-compression-multi-query-similarity": QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_SIMILARITY_PAYLOAD,
"contextual-compression-multi-query-similarity-threshold": QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_SIMILARITY_SCORE_PAYLOAD,
# Keeping these for future use:
# "contexual-compression-similarity-threshold": QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_SEARCH_TYPE_SIMILARITY_WITH_SCORE_PAYLOAD,
# "vector-store-mmr": QUERY_WITH_VECTOR_STORE_RETRIEVER_MMR_PAYLOAD,
# "vector-store-similarity-threshold": QUERY_WITH_VECTOR_STORE_RETRIEVER_SIMILARITY_SCORE_PAYLOAD,
# "contexual-compression-similarity": QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_PAYLOAD,
# "contexual-compression-similarity-threshold": QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_SEARCH_TYPE_SIMILARITY_WITH_SCORE_PAYLOAD,
# "multi-query-similarity": QUERY_WITH_MULTI_QUERY_RETRIEVER_SIMILARITY_PAYLOAD,
# "multi-query-mmr": QUERY_WITH_MULTI_QUERY_RETRIEVER_MMR_PAYLOAD,
# "contexual-compression-multi-query-similarity": QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_SIMILARITY_PAYLOAD,
}

if settings.LOCAL:
EXAMPLES.update(
{
"contexual-compression-mmr": QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_SEARCH_TYPE_MMR_PAYLOAD,
"contexual-compression-multi-query-mmr": QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_MMR_PAYLOAD,
}
)
# if settings.LOCAL:
# EXAMPLES.update(
# {
# "contexual-compression-similarity": QUERY_WITH_CONTEXTUAL_COMPRESSION_RETRIEVER_PAYLOAD,
# "contexual-compression-multi-query-similarity": QUERY_WITH_CONTEXTUAL_COMPRESSION_MULTI_QUERY_RETRIEVER_SIMILARITY_PAYLOAD,
# "multi-query-similarity": QUERY_WITH_MULTI_QUERY_RETRIEVER_SIMILARITY_PAYLOAD,
# "multi-query-similarity-threshold": QUERY_WITH_MULTI_QUERY_RETRIEVER_SIMILARITY_SCORE_PAYLOAD,
# }
# )


@query_controller("/example-app")
Expand Down
Loading

0 comments on commit 79d8372

Please sign in to comment.