updates to pdf table parser #168

Merged · 1 commit · May 25, 2024
2 changes: 1 addition & 1 deletion backend/modules/metadata_store/local.py

@@ -92,7 +92,7 @@ def get_collections(
     ) -> List[Collection]:
         return [self.collection]
 
-    def list_collections(
+    async def list_collections(
         self,
     ) -> List[str]:
         return [self.collection.name]
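Since list_collections is now a coroutine, every call site has to await it. A minimal sketch of the new call pattern, assuming a LocalMetadataStore can be constructed directly (setup details are not part of this diff):

import asyncio

async def main():
    # Hypothetical setup; constructor arguments are not shown in this diff
    store = LocalMetadataStore()
    # list_collections is async after this change, so it must be awaited
    names = await store.list_collections()
    print(names)

asyncio.run(main())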
72 changes: 37 additions & 35 deletions backend/modules/parsers/tablepdfparser.py

@@ -1,9 +1,11 @@
 import os
 import re
 
+import pandas as pd
 from langchain.docstore.document import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
+from backend.logger import logger
 from backend.modules.parsers.parser import BaseParser
 from backend.modules.parsers.utils import contains_text
 
@@ -15,11 +17,14 @@ class PdfTableParser(BaseParser):
 
     supported_file_extensions = [".pdf"]
 
-    def __init__(self, max_chunk_size: int = 1024, *args, **kwargs):
+    def __init__(
+        self, max_chunk_size: int = 1024, chunk_overlap: int = 20, *args, **kwargs
+    ):
         """
         Initializes the PdfTableParser object.
         """
         self.max_chunk_size = max_chunk_size
+        self.chunk_overlap = chunk_overlap
 
     async def get_chunks(self, filepath, metadata, *args, **kwargs):
         """
@@ -47,7 +52,7 @@ async def get_chunks(self, filepath, metadata, *args, **kwargs):
             "OCR.USE_TESSERACT=False",
         ]
 
-        print("Parsing file - " + str(filepath))
+        logger.info("Parsing file - " + str(filepath))
         analyzer = dd.get_dd_analyzer(
             reset_config_file=False, config_overwrite=config_overwrite
         )
@@ -63,42 +68,39 @@ async def get_chunks(self, filepath, metadata, *args, **kwargs):
         if len(tables) > 0:
             for table in tables:
                 table_data = table.csv
-                print(
+                logger.info(
                     "-----------------Table for page - "
                     + str(page.page_number)
                     + "---------------------"
                 )
 
-                table_data = pd.DataFrame(table_data)
-                # Table data is a pandas DataFrame, convert it to console string
-                table_data = table_data.to_string()
-                if table_data and contains_text(table_data):
-                    tab_doc = [
-                        Document(
-                            page_content=table_data,
-                            metadata={
-                                "page_num": page.page_number,
-                                "type": "table",
-                                "table_num": ix,
-                            },
-                        )
-                    ]
-                    table_docs.extend(tab_doc)
-
-                if table_data and contains_text(table_data):
-                    tab_doc = [
-                        Document(
-                            page_content=table_data,
-                            metadata={
-                                "page_num": page.page_number,
-                                "type": "table",
-                                "table_num": ix,
-                            },
-                        )
-                    ]
-                    table_docs.extend(tab_doc)
+                try:
+                    # give a temporary path to save the table data
+                    table_path = filepath + "_table_data.csv"
+                    table_data.to_csv(table_path, index=False, header=False)
+                    table_data = pd.read_csv(table_path, header=0)
+                    table_data = table_data.to_string(index=False, header=False)
+                    logger.info(f"Table data String:\n {table_data}")
+                    # input("Press Enter to continue...")
+                    if contains_text(table_data):
+                        tab_doc = [
+                            Document(
+                                page_content=table_data,
+                                metadata={
+                                    "page_num": page.page_number + 1,
+                                    "type": "table",
+                                    "table_num": ix,
+                                },
+                            )
+                        ]
+                        table_docs.extend(tab_doc)
+                    os.remove(table_path)
+                except Exception as ex:
+                    logger.error(f"Error while processing table data: {str(ex)}")
+                    # Return an empty list if there was an error during processing
+                    continue
 
         text = page.text
 
         # clean up text for any problematic characters
         text = re.sub("\n", " ", text).strip()
         text = text.encode("ascii", errors="ignore").decode("ascii")
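The new try block round-trips each table through a temporary CSV so pandas re-reads it with the first row as the header, then renders it without index or header before chunking. A self-contained sketch of that transformation, with made-up data standing in for deepdoctection's table.csv:

import os

import pandas as pd

# Stand-in for table.csv; assumed DataFrame-compatible, as in the merged code
table_data = pd.DataFrame([["item", "q1", "q2"], ["revenue", "10", "12"]])

table_path = "example_table_data.csv"
table_data.to_csv(table_path, index=False, header=False)

# header=0 promotes the first row to column names, and
# to_string(header=False) then drops it from the rendered text
table_data = pd.read_csv(table_path, header=0)
print(table_data.to_string(index=False, header=False))  # -> revenue  10  12

os.remove(table_path)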
@@ -109,14 +111,14 @@ async def get_chunks(self, filepath, metadata, *args, **kwargs):
             if len(text) > self.max_chunk_size:
                 # Split the text into chunks of size less than or equal to max_chunk_size
                 text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=self.max_chunk_size, chunk_overlap=200
+                    chunk_size=self.max_chunk_size, chunk_overlap=self.chunk_overlap
                 )
                 text_splits = text_splitter.split_text(text)
                 texts = [
                     Document(
                         page_content=text_split,
                         metadata={
-                            "page_num": page.page_number,
+                            "page_num": page.page_number + 1,
                             "type": "text",
                         },
                     )
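For reference, RecursiveCharacterTextSplitter caps each chunk at chunk_size characters and repeats up to chunk_overlap characters across consecutive chunks, which is what the parser now does with its configured values. A toy example with small numbers:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
chunks = splitter.split_text("A sentence about quarterly revenue. " * 20)

# Every chunk is at most 100 characters; adjacent chunks share up to
# 20 characters so sentences are not cut off without context
print(len(chunks), max(len(c) for c in chunks))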
@@ -131,13 +133,13 @@ async def get_chunks(self, filepath, metadata, *args, **kwargs):
                     Document(
                         page_content=text,
                         metadata={
-                            "page_num": page.page_number,
+                            "page_num": page.page_number + 1,
                             "type": "text",
                         },
                     )
                 )
         # except Exception as ex:
-        #     print(f"Error while parsing PDF file at {filepath}: {str(ex)}")
+        #     logger.info(f"Error while parsing PDF file at {filepath}: {str(ex)}")
         # # Return an empty list if there was an error during processing
         # return []
 
28 changes: 22 additions & 6 deletions local.metadata.yaml

@@ -1,13 +1,29 @@
-collection_name: finance
+# Test 1
+# collection_name: sdocs1
+# data_source:
+#   type: localdir
+#   uri: /Users/prathamesh/Desktop/s-docs
+# parser_config:
+#   chunk_size: 1000
+#   chunk_overlap: 40
+#   parser_map:
+#     ".pdf": PdfTableParser
+# embedder_config:
+#   provider: mixedbread
+#   config:
+#     model: "mixedbread-ai/mxbai-embed-large-v1"
+
+# Test 2
+collection_name: sdocs2string
 data_source:
   type: localdir
-  uri: ./sample-data/finance
+  uri: /Users/prathamesh/Desktop/s-docs
 parser_config:
   chunk_size: 1000
-  chunk_overlap: 20
+  chunk_overlap: 40
   parser_map:
-    ".pdf": MultiModalParser
+    ".pdf": PdfTableParser
 embedder_config:
-  provider: truefoundry
+  provider: mixedbread
   config:
-    model: "openai-main/text-embedding-ada-002"
+    model: "mixedbread-ai/mxbai-embed-large-v1"