Support the usage of groups by the table plugin.

bosd · bosd · commit ac11ba81f98d · 2025-02-08T14:36:04.000+01:00
diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -308,6 +308,17 @@ The plugin uses the following properties:
 * **`end`**: A regex to identify the end of the table.
 * **`body`**: A regex with named capture groups to extract the data. The names of the capture groups will become the field names in the output.
 
+**Optional Properties**
+
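+* **`type`**: Specifies the data type of the extracted value. Can be `int`, `float`, or `date`. Set per field under `fields` (or through the table-level `types` mapping).
+* **`group`**: Defines how to handle multiple matches of a field. Set per field under `fields`. Options include: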
+    * `sum`: Sum the values.
+    * `min`: Return the minimum value.
+    * `max`: Return the maximum value.
+    * `first`: Return the first match.
+    * `last`: Return the last match.
+    * `join`: Join the matches into a single string.
+
 The plugin will try to match the `body` regex to the text between the `start` and `end` markers.
 
 **Example Invoice**
@@ -339,7 +350,9 @@ headings. A template to capture these fields may look like:
     tables:
       - start: Hotel Details\s+Check In\s+Check Out\s+Rooms
         end: Booking ID
-        body: (?P<hotel_details>[\S ]+),\s+(?P<date_check_in>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<date_check_out>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<amount_rooms>\d+)
+        body: (?P<hotel_details>[\S ]+),\s+(?P<date_check_in>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<date_check_out>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<qty_rooms>\d+)
+        types:
+          qty_rooms: int
       - start: Booking ID\s+Payment Mode
         end: DESCRIPTION
         body: (?P<booking_id>\w+)\s+(?P<payment_method>(?:\w+ ?)*)
@@ -350,6 +363,19 @@ By default, all fields are parsed as strings. The `tables` plugin
 supports the `amount` and `date` field naming conventions to convert
 data types.
 
+The `tables` plugin supports grouping options for fields that match more than once.
+This is useful when, for example, one wants to sum the numbers in a column:
+```yaml
+    tables:
+      - start: Basic example to sum a number
+        end: with the help of the table plugin
+        body: (?P<random_num_to_sum>\d+\.\d+)
+        fields:
+          random_num_to_sum:
+            group: sum
+            type: float
+```
+
 ### Options
 
 Everything under `options` is optional. We expect to add more options in
diff --git a/src/invoice2data/extract/plugins/tables.py b/src/invoice2data/extract/plugins/tables.py
@@ -7,6 +7,8 @@
 from typing import Dict
 from typing import Optional
 
+from ..utils import _apply_grouping
+
 
 logger = getLogger(__name__)
 
@@ -21,11 +23,12 @@ def extract(
     Args:
         self (InvoiceTemplate): The current instance of the class.  # noqa: DOC103
         content (str): The content of the invoice.
-        output (Dict[str, Any]): A dictionary to store the extracted data.
+        output (Dict[str, Any]): A dictionary to store the extracted data; it is
+                                    updated in place with the fields of each table.
 
     Returns:
-        Optional[Dict[str, Any]]: The updated output dictionary with extracted
-                                   data, or None if date parsing fails.
+        Optional[Dict[str, Any]]: The output dictionary updated with the extracted table fields.
+                                Tables whose lines cannot be processed are skipped.
     """
     for i, table in enumerate(self["tables"]):
         logger.debug("Testing Rules set #%s", i)
@@ -41,8 +44,18 @@ def extract(
             continue
 
         # Process table lines
-        if not _process_table_lines(self, table, table_body, output):
-            return None  # Return None if date parsing fails
+        table_data = _process_table_lines(self, table, table_body)
+        if table_data is None:
+            continue
+
+        # Apply grouping to individual fields within table_data
+        for field, field_settings in table.get("fields", {}).items():
+            if "group" in field_settings:
+                grouped_value = _apply_grouping(field_settings, table_data.get(field))
+                if grouped_value is not None:
+                    table_data[field] = grouped_value
+
+        output.update(table_data)
 
     return output
 
@@ -104,29 +117,28 @@ def _process_table_lines(
     self: "OrderedDict[str, Any]",
     table: Dict[str, Any],
     table_body: str,
-    output: Dict[str, Any],
-) -> bool:
+) -> Optional[Dict[str, Any]]:
     """Process the lines within the table body.
 
     Args:
         self (InvoiceTemplate): The current instance of the class.  # noqa: DOC103
         table (Dict[str, Any]): The validated table settings.
         table_body (str): The extracted table body.
-        output (Dict[str, Any]): A dictionary to store the extracted data.
 
     Returns:
-        bool: True if processing is successful, False if date parsing fails.
+        Optional[Dict[str, Any]]: The extracted fields keyed by capture group name (values of
+                              repeated matches are collected in a list), or None if processing a line fails.
     """
     types = table.get("types", {})
     no_match_found = True
-
+    line_output: Dict[str, Any] = {}
     for line in re.split(table["line_separator"], table_body):
         if not line.strip("").strip("\n") or line.isspace():
             continue
 
         # Correct the function call and return logic
-        if not _process_table_line(self, table, line, types, output):
-            return False  # Return False immediately if date parsing fails
+        if not _process_table_line(self, table, line, types, line_output):
+            return None  # Return None immediately if line parsing fails
         else:
             no_match_found = (
                 False  # Update no_match_found only if line processing is successful
@@ -137,10 +149,11 @@ def _process_table_lines(
             "\033[1;43mWarning\033[0m regex=\033[91m*%s*\033[0m doesn't match anything!",
             table["body"],
         )
-    return True
 
+    return line_output
 
-def _process_table_line(
+
+def _process_table_line(  # noqa: C901
     self: "OrderedDict[str, Any]",
     table: Dict[str, Any],
     line: str,
@@ -162,9 +175,6 @@ def _process_table_line(
     match = re.search(table["body"], line)
     if match:
         for field, value in match.groupdict().items():
-            if field in output:
-                continue
-
             logger.debug(
                 (
                     "field=\033[1m\033[93m%s\033[0m |"
@@ -177,18 +187,32 @@ def _process_table_line(
             )
 
             if field.startswith("date") or field.endswith("date"):
-                output[field] = self.parse_date(value)  # type: ignore[attr-defined]
-                if not output[field]:
+                value = self.parse_date(value)  # type: ignore[attr-defined]
+                if not value:
                     logger.error("Date parsing failed on date *%s*", value)
                     return False
             elif field.startswith("amount"):
-                output[field] = self.parse_number(value)  # type: ignore[attr-defined]
+                value = self.parse_number(value)  # type: ignore[attr-defined]
             elif field in types:
-                # Access types as a dictionary
-                output[field] = self.coerce_type(value, types[field])  # type: ignore[attr-defined]
+                value = self.coerce_type(value, types[field])  # type: ignore[attr-defined]
+            elif table.get("fields"):
+                # Writing templates is hard, so we also support the following format
+                # in case someone mixes up the syntax:
+                # fields:
+                #    example_field:
+                #      type: float
+                #      group: sum
+                field_set = table["fields"].get(field, {})
+                if "type" in field_set:
+                    value = self.coerce_type(value, field_set.get("type"))  # type: ignore[attr-defined]
+
+            if field in output:
+                # Ensure output[field] is a list before appending
+                if not isinstance(output[field], list):
+                    output[field] = [output[field]]
+                output[field].append(value)
             else:
                 output[field] = value
-
         # Return True if a match is found and processed successfully
         return True
     else:
diff --git a/tests/custom/table-groups.json b/tests/custom/table-groups.json
@@ -0,0 +1,20 @@
+[
+  {
+    "issuer": "Table Groups Tests",
+    "date": "2024-12-20",
+    "invoice_number": "007/10/2024",
+    "amount": 123.4,
+    "currency": "EUR",
+    "hotel_details": [
+      "OYO 4189 Resort Nanganallur",
+      "OYO 4189 Resort Nanganallur Suite A"
+    ],
+    "date_check_in": "2024-01-08",
+    "date_check_out": "2024-12-31",
+    "qty_rooms": 2,
+    "line_tax_percent": ["1%", "2%", "0%"],
+    "lamount_tax": ["3.00", "2.00", "0.00"],
+    "random_num_to_sum": 11.01,
+    "desc": "Invoice from Table Groups Tests"
+  }
+]
diff --git a/tests/custom/table-groups.txt b/tests/custom/table-groups.txt
@@ -0,0 +1,31 @@
+Issue date: 2024-12-20
+Issuer: Table Group Tests
+Invoice number: 007/10/2024
+Total: 123.40 EUR
+
+Table basic
+
+Simple table start
+Tax precentage     amount   qty
+1%                 3.00     7.00
+2%                  2.00    4.00
+0%                  0.00    0.01
+Simple table end
+
+
+
+Sample data below to test advanced grouping functions of table parser.
+
+Guest Name: Sanjay
+
+Hotel Details                                                   Check In            Check Out       Rooms
+OYO 4189 Resort Nanganallur,                                    01/08/2024          01/01/2018      1
+OYO 4189 Resort Nanganallur Suite A,                            31/12/2017          31/12/2024      1
+25,Vembuliamman Koil Street,, Pazhavanthangal, Chennai
+                                                                    Booking ID              Payment Mode
+                                                                    IBZY2087                Cash at Hotel
+
+
+invoice2data --input-reader=text --debug ./table-groups.txt
+
+invoice2data ./table-groups.txt --debug -t ./templates
diff --git a/tests/custom/templates/table-groups.yml b/tests/custom/templates/table-groups.yml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: MIT
+issuer: Table Groups Tests
+keywords:
+  - Table basic
+  - Simple table start
+
+tables:
+  - start: Hotel Details\s+Check In\s+Check Out\s+Rooms
+    end: Booking ID
+    body: (?P<hotel_details>[\S ]+),\s+(?P<date_check_in>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<date_check_out>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<qty_rooms>\d)
+    types:
+      qty_rooms: int
+    fields:
+      qty_rooms:
+        group: sum
+      date_check_in:
+        group: first
+      date_check_out:
+        group: last
+  - start: Tax precentage     amount   qty
+    end: Simple table end
+    body: (?P<line_tax_percent>\d[%])\s+(?P<lamount_tax>\d\.\d{2})\s+(?P<random_num_to_sum>\d\.\d{2})
+    fields:
+      random_num_to_sum:
+        group: sum
+        # type: float # This is also supported
+    types:
+      random_num_to_sum: float # this is supported
+
+fields:
+  date:
+    parser: regex
+    regex: Issue date:\s*(\d{4}-\d{2}-\d{2})
+    type: date
+  invoice_number:
+    parser: regex
+    regex: Invoice number:\s*([\d/]+)
+  amount:
+    parser: regex
+    regex: Total:\s*(\d+\.\d\d)
+    type: float
+options:
+  currency: EUR
+  date_formats:
+    - "%Y-%m-%d"
+  decimal_separator: "."