[REF] Main.py

bosd · bosd · commit 4531ff5104b6 · 2024-12-28T22:04:57.000+01:00
Reduce complexity
Removed unnecessary list conversion: The templates = list(templates) line is removed as it's not needed since we're iterating through the templates directly.
Removed result initialization: The result variable initialization is removed as we're now returning directly from the loop if a match is found.
Return directly on match: If a template matches, the function now returns the result of template.extract directly within the loop.
Simplified OCR fallback: The OCR fallback logic is simplified by directly returning the result if a template is found.
Explanation:

The original code had some unnecessary steps and nested conditions that increased the complexity. By removing the redundant list conversion and returning directly from the loop when a match is found, we simplify the control flow and reduce the cognitive complexity of the function.

The simplified OCR fallback logic also contributes to reducing the complexity by avoiding unnecessary nesting and making the code easier to follow.

Added typehint for `Templates`. This fixes mypy error.

Changes in ocrmypdf fallback:
Renamed template to templates_matched: I renamed the template variable to templates_matched in the assignment to make it clearer that extract_data_fallback_ocrmypdf returns a list of templates.
Checked for matched templates: I added a check if templates_matched: to ensure that at least one template matched before trying to access the first element.
Assigned the first matched template: If templates_matched is not empty, I assign the first template in the list to the template variable.
diff --git a/src/invoice2data/__main__.py b/src/invoice2data/__main__.py
@@ -109,7 +109,9 @@ def format(self, record: logging.LogRecord) -> str:
 
 
 def extract_data(
-    invoicefile: str, templates: Optional[List[Any]] = None, input_module: Any = None
+    invoicefile: str,
+    templates: Optional[List[InvoiceTemplate]] = None,
+    input_module: Any = None,
 ) -> Dict[str, Any]:
     """Extracts structured data from PDF/image invoices.
 
@@ -121,8 +123,8 @@ def extract_data(
 
     Args:
         invoicefile (str): Path of electronic invoice file in PDF, JPEG, PNG
-        templates (Optional[List[Any]]): List of instances of class `InvoiceTemplate`.
-                                        Templates are loaded using `read_template` function in `loader.py`.
+        templates (Optional[List[InvoiceTemplate]]): List of instances of class `InvoiceTemplate`.
+                                            Templates are loaded using `read_template` function in `loader.py`.
         input_module (Any, optional): Library to be used to extract text
                                         from the given `invoicefile`.
                                         Choices: {'pdftotext', 'pdfminer', 'tesseract', 'text'}.
@@ -148,10 +150,7 @@ def extract_data(
     if isinstance(input_module, str):
         input_module = input_mapping[input_module]
     elif input_module is None:
-        if invoicefile.lower().endswith(".txt"):
-            input_module = text
-        else:
-            input_module = pdftotext
+        input_module = text if invoicefile.lower().endswith(".txt") else pdftotext
 
     extracted_str = input_module.to_text(invoicefile)
     if not isinstance(extracted_str, str) or not extracted_str.strip():
@@ -167,39 +166,28 @@ def extract_data(
     )
     logger.debug("END pdftotext result =============================")
 
-    if not templates:
-        templates = read_templates()
-
-    # Convert templates to a list to allow indexing
-    templates = list(templates)
+    templates = templates or read_templates()
 
-    # Initialize result as an empty dictionary
-    result: Dict[str, Any] = {}
     for template in templates:
         if template.matches_input(extracted_str):
             logger.info("Using %s template", template["template_name"])
             optimized_str = template.prepare_input(extracted_str)
-            result = template.extract(optimized_str, invoicefile, input_module)
-            break
-
-    if not result:
-        if ocrmypdf.ocrmypdf_available() and input_module is not ocrmypdf:
-            logger.debug("Text extraction failed, falling back to ocrmypdf")
-            extracted_str, invoicefile, templates_matched = (
-                extract_data_fallback_ocrmypdf(invoicefile, templates, input_module)
-            )
-            if templates_matched:
-                result = templates_matched[0].extract(
-                    extracted_str, invoicefile, input_module
-                )
-            else:
-                logger.error("No template for %s", invoicefile)
-                return {}
-        else:
-            logger.error("No template for %s", invoicefile)
-            return {}
-
-    return deepcopy(result)
+            return template.extract(
+                optimized_str, invoicefile, input_module
+            )  # Return directly if match found
+
+    # If no template matches, try OCR fallback
+    if ocrmypdf.ocrmypdf_available() and input_module is not ocrmypdf:
+        logger.debug("Text extraction failed, falling back to ocrmypdf")
+        extracted_str, invoicefile, templates_matched = extract_data_fallback_ocrmypdf(
+            invoicefile, templates, input_module
+        )
+        if templates_matched:
+            template = templates_matched[0]
+            return template.extract(extracted_str, invoicefile, input_module)
+
+    logger.error("No template for %s", invoicefile)
+    return {}
 
 
 def extract_data_fallback_ocrmypdf(