Skip to content

Commit 4531ff5

Browse files
committed
[REF] Main.py
Reduce complexity. Removed unnecessary list conversion: the `templates = list(templates)` line is removed; it is not needed because we iterate through the templates directly. Removed result initialization: the `result` variable is no longer initialized, because we now return directly from the loop when a match is found. Return directly on match: if a template matches, the function returns the result of `template.extract` from within the loop. Simplified OCR fallback: the OCR fallback now returns the result directly if a template is found. Explanation: the original code had some unnecessary steps and nested conditions that increased its complexity. Removing the redundant list conversion and returning directly from the loop when a match is found simplifies the control flow and reduces the cognitive complexity of the function. The simplified OCR fallback logic also reduces complexity by avoiding unnecessary nesting, making the code easier to follow. Added a type hint for `templates` (`Optional[List[InvoiceTemplate]]`); this fixes a mypy error. Changes in the ocrmypdf fallback: Renamed `template` to `templates_matched`: the variable receiving the return value is renamed to make it clearer that `extract_data_fallback_ocrmypdf` returns a list of templates. Checked for matched templates: added an `if templates_matched:` check to ensure that at least one template matched before accessing the first element. Assigned the first matched template: if `templates_matched` is not empty, the first template in the list is assigned to the `template` variable.
1 parent 22c221b commit 4531ff5

File tree

1 file changed

+23
-35
lines changed

1 file changed

+23
-35
lines changed

src/invoice2data/__main__.py

+23-35
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ def format(self, record: logging.LogRecord) -> str:
109109

110110

111111
def extract_data(
112-
invoicefile: str, templates: Optional[List[Any]] = None, input_module: Any = None
112+
invoicefile: str,
113+
templates: Optional[List[InvoiceTemplate]] = None,
114+
input_module: Any = None,
113115
) -> Dict[str, Any]:
114116
"""Extracts structured data from PDF/image invoices.
115117
@@ -121,8 +123,8 @@ def extract_data(
121123
122124
Args:
123125
invoicefile (str): Path of electronic invoice file in PDF, JPEG, PNG
124-
templates (Optional[List[Any]]): List of instances of class `InvoiceTemplate`.
125-
Templates are loaded using `read_template` function in `loader.py`.
126+
templates (Optional[List[InvoiceTemplate]]): List of instances of class `InvoiceTemplate`.
127+
Templates are loaded using `read_template` function in `loader.py`.
126128
input_module (Any, optional): Library to be used to extract text
127129
from the given `invoicefile`.
128130
Choices: {'pdftotext', 'pdfminer', 'tesseract', 'text'}.
@@ -148,10 +150,7 @@ def extract_data(
148150
if isinstance(input_module, str):
149151
input_module = input_mapping[input_module]
150152
elif input_module is None:
151-
if invoicefile.lower().endswith(".txt"):
152-
input_module = text
153-
else:
154-
input_module = pdftotext
153+
input_module = text if invoicefile.lower().endswith(".txt") else pdftotext
155154

156155
extracted_str = input_module.to_text(invoicefile)
157156
if not isinstance(extracted_str, str) or not extracted_str.strip():
@@ -167,39 +166,28 @@ def extract_data(
167166
)
168167
logger.debug("END pdftotext result =============================")
169168

170-
if not templates:
171-
templates = read_templates()
172-
173-
# Convert templates to a list to allow indexing
174-
templates = list(templates)
169+
templates = templates or read_templates()
175170

176-
# Initialize result as an empty dictionary
177-
result: Dict[str, Any] = {}
178171
for template in templates:
179172
if template.matches_input(extracted_str):
180173
logger.info("Using %s template", template["template_name"])
181174
optimized_str = template.prepare_input(extracted_str)
182-
result = template.extract(optimized_str, invoicefile, input_module)
183-
break
184-
185-
if not result:
186-
if ocrmypdf.ocrmypdf_available() and input_module is not ocrmypdf:
187-
logger.debug("Text extraction failed, falling back to ocrmypdf")
188-
extracted_str, invoicefile, templates_matched = (
189-
extract_data_fallback_ocrmypdf(invoicefile, templates, input_module)
190-
)
191-
if templates_matched:
192-
result = templates_matched[0].extract(
193-
extracted_str, invoicefile, input_module
194-
)
195-
else:
196-
logger.error("No template for %s", invoicefile)
197-
return {}
198-
else:
199-
logger.error("No template for %s", invoicefile)
200-
return {}
201-
202-
return deepcopy(result)
175+
return template.extract(
176+
optimized_str, invoicefile, input_module
177+
) # Return directly if match found
178+
179+
# If no template matches, try OCR fallback
180+
if ocrmypdf.ocrmypdf_available() and input_module is not ocrmypdf:
181+
logger.debug("Text extraction failed, falling back to ocrmypdf")
182+
extracted_str, invoicefile, templates_matched = extract_data_fallback_ocrmypdf(
183+
invoicefile, templates, input_module
184+
)
185+
if templates_matched:
186+
template = templates_matched[0]
187+
return template.extract(extracted_str, invoicefile, input_module)
188+
189+
logger.error("No template for %s", invoicefile)
190+
return {}
203191

204192

205193
def extract_data_fallback_ocrmypdf(

0 commit comments

Comments
 (0)