Skip to content

Commit 26d96d5

Browse files
authored
Merge pull request #13 from gowthamshankar99/doc-support
[Enhance] Support for doc and docx files
2 parents cfb4bc1 + ee673b4 commit 26d96d5

File tree

2 files changed

+40
-2
lines changed

2 files changed

+40
-2
lines changed

requirements.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -23,4 +23,4 @@ rpds-py==0.18.0 ; python_version >= "3.10"
2323
s3transfer==0.10.1 ; python_version >= "3.10"
2424
six==1.16.0 ; python_version >= "3.10"
2525
typing-extensions==4.10.0 ; python_version >= "3.10"
26-
urllib3==2.2.1 ; python_version >= "3.10"
26+
urllib3==2.2.1 ; python_version >= "3.10"

src/rhubarb/file_converter/file_converter.py

+39 -1
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,14 @@
99

1010
import boto3
1111
import pdfplumber
12-
from PIL import Image
12+
from PIL import Image, ImageDraw
1313

14+
try:
15+
from docx import Document
16+
17+
DOCX_AVAILABLE = True
18+
except ImportError:
19+
DOCX_AVAILABLE = False
1420
from .image_validator import ImageValidator
1521

1622
logger = logging.getLogger(__name__)
@@ -144,6 +150,38 @@ def convert_to_base64(self) -> List[Dict[str, Union[int, str]]]:
144150
base64_string = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
145151
base64_strings.append({"page": i + 1, "base64string": base64_string})
146152
return base64_strings
153+
elif self.mime_type in [
154+
"application/msword",
155+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
156+
]:
157+
if not DOCX_AVAILABLE:
158+
raise ImportError(
159+
"The 'python-docx' library is not installed. Please install it to process .docx files."
160+
)
161+
document = Document(
162+
BytesIO(self.file_bytes)
163+
if self.file_path.startswith("s3://")
164+
else self.file_path
165+
)
166+
base64_strings = []
167+
page_count = len(document.paragraphs) # Assuming paragraphs as a proxy for pages
168+
if self.pages == [0]:
169+
page_nums = range(min(20, page_count))
170+
else:
171+
page_nums = [p - 1 for p in self.pages if p <= page_count and p > 0]
172+
173+
for page_num in page_nums:
174+
paragraph = document.paragraphs[page_num].text
175+
img = Image.new(
176+
"RGB", (800, 600), color=(255, 255, 255)
177+
) # Placeholder image for paragraph
178+
d = ImageDraw.Draw(img)
179+
d.text((10, 10), paragraph, fill=(0, 0, 0))
180+
img_bytes = BytesIO()
181+
img.save(img_bytes, format="PNG")
182+
base64_string = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
183+
base64_strings.append({"page": page_num + 1, "base64string": base64_string})
184+
return base64_strings
147185
else:
148186
logger.error("Unsupported file type")
149187
raise ValueError("Unsupported file type")

0 commit comments

Comments
 (0)