|
9 | 9 |
|
10 | 10 | import boto3
|
11 | 11 | import pdfplumber
|
12 |
| -from PIL import Image |
| 12 | +from PIL import Image, ImageDraw |
13 | 13 |
|
| 14 | +try: |
| 15 | + from docx import Document |
| 16 | + |
| 17 | + DOCX_AVAILABLE = True |
| 18 | +except ImportError: |
| 19 | + DOCX_AVAILABLE = False |
14 | 20 | from .image_validator import ImageValidator
|
15 | 21 |
|
16 | 22 | logger = logging.getLogger(__name__)
|
@@ -144,6 +150,38 @@ def convert_to_base64(self) -> List[Dict[str, Union[int, str]]]:
|
144 | 150 | base64_string = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
145 | 151 | base64_strings.append({"page": i + 1, "base64string": base64_string})
|
146 | 152 | return base64_strings
|
| 153 | + elif self.mime_type in [ |
| 154 | + "application/msword", |
| 155 | + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", |
| 156 | + ]: |
| 157 | + if not DOCX_AVAILABLE: |
| 158 | + raise ImportError( |
| 159 | + "The 'python-docx' library is not installed. Please install it to process .docx files." |
| 160 | + ) |
| 161 | + document = Document( |
| 162 | + BytesIO(self.file_bytes) |
| 163 | + if self.file_path.startswith("s3://") |
| 164 | + else self.file_path |
| 165 | + ) |
| 166 | + base64_strings = [] |
| 167 | + page_count = len(document.paragraphs) # Assuming paragraphs as a proxy for pages |
| 168 | + if self.pages == [0]: |
| 169 | + page_nums = range(min(20, page_count)) |
| 170 | + else: |
| 171 | + page_nums = [p - 1 for p in self.pages if p <= page_count and p > 0] |
| 172 | + |
| 173 | + for page_num in page_nums: |
| 174 | + paragraph = document.paragraphs[page_num].text |
| 175 | + img = Image.new( |
| 176 | + "RGB", (800, 600), color=(255, 255, 255) |
| 177 | + ) # Placeholder image for paragraph |
| 178 | + d = ImageDraw.Draw(img) |
| 179 | + d.text((10, 10), paragraph, fill=(0, 0, 0)) |
| 180 | + img_bytes = BytesIO() |
| 181 | + img.save(img_bytes, format="PNG") |
| 182 | + base64_string = base64.b64encode(img_bytes.getvalue()).decode("utf-8") |
| 183 | + base64_strings.append({"page": page_num + 1, "base64string": base64_string}) |
| 184 | + return base64_strings |
147 | 185 | else:
|
148 | 186 | logger.error("Unsupported file type")
|
149 | 187 | raise ValueError("Unsupported file type")
|
|
0 commit comments