Skip to content

Commit ac11ba8

Browse files
committed
Support the usage of groups by the table plugin.
1 parent fc6c018 commit ac11ba8

File tree

5 files changed

+171
-24
lines changed

5 files changed

+171
-24
lines changed

docs/tutorial.md

+27-1
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,17 @@ The plugin uses the following properties:
308308
* **`end`**: A regex to identify the end of the table.
309309
* **`body`**: A regex with named capture groups to extract the data. The names of the capture groups will become the field names in the output.
310310

311+
**Optional Properties**
312+
313+
* **`type`**: Specifies the data type of the extracted value. Can be `int`, `float`, or `date`.
314+
* **`group`**: Defines how to handle multiple matches. Options include:
315+
* `sum`: Sum the values.
316+
* `min`: Return the minimum value.
317+
* `max`: Return the maximum value.
318+
* `first`: Return the first match.
319+
* `last`: Return the last match.
320+
* `join`: Join the matches into a single string.
321+
311322
The plugin will try to match the `body` regex to the text between the `start` and `end` markers.
312323

313324
**Example Invoice**
@@ -339,7 +350,9 @@ headings. A template to capture these fields may look like:
339350
tables:
340351
- start: Hotel Details\s+Check In\s+Check Out\s+Rooms
341352
end: Booking ID
342-
body: (?P<hotel_details>[\S ]+),\s+(?P<date_check_in>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<date_check_out>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<amount_rooms>\d+)
353+
body: (?P<hotel_details>[\S ]+),\s+(?P<date_check_in>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<date_check_out>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<qty_rooms>\d+)
354+
types:
355+
qty_rooms: int
343356
- start: Booking ID\s+Payment Mode
344357
end: DESCRIPTION
345358
body: (?P<booking_id>\w+)\s+(?P<payment_method>(?:\w+ ?)*)
@@ -350,6 +363,19 @@ By default, all fields are parsed as strings. The `tables` plugin
350363
supports the `amount` and `date` field naming conventions to convert
351364
data types.
352365

366+
The `tables` plugin also supports grouping options for fields that match more than once.
367+
This is useful when one wants to sum the numbers in a column. Example:
368+
```yaml
369+
tables:
370+
- start: Basic example to sum a number
371+
end: with the help of the table plugin
372+
body: (?P<random_num_to_sum>\d+\.\d+)
373+
fields:
374+
random_num_to_sum:
375+
group: sum
376+
type: float
377+
```
378+
353379
### Options
354380

355381
Everything under `options` is optional. We expect to add more options in

src/invoice2data/extract/plugins/tables.py

+47-23
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from typing import Dict
88
from typing import Optional
99

10+
from ..utils import _apply_grouping
11+
1012

1113
logger = getLogger(__name__)
1214

@@ -21,11 +23,12 @@ def extract(
2123
Args:
2224
self (InvoiceTemplate): The current instance of the class. # noqa: DOC103
2325
content (str): The content of the invoice.
24-
output (Dict[str, Any]): A dictionary to store the extracted data.
26+
output (Dict[str, Any]): A dictionary to store the extracted data; it is
27+
updated in place with the fields found in each table.
2528
2629
Returns:
27-
Optional[Dict[str, Any]]: The updated output dictionary with extracted
28-
data, or None if date parsing fails.
30+
Optional[List[Any]]: The extracted data as a list of dictionaries, or None if table parsing fails.
31+
Each dictionary represents a row in the table.
2932
"""
3033
for i, table in enumerate(self["tables"]):
3134
logger.debug("Testing Rules set #%s", i)
@@ -41,8 +44,18 @@ def extract(
4144
continue
4245

4346
# Process table lines
44-
if not _process_table_lines(self, table, table_body, output):
45-
return None # Return None if date parsing fails
47+
table_data = _process_table_lines(self, table, table_body)
48+
if table_data is None:
49+
continue
50+
51+
# Apply grouping to individual fields within table_data
52+
for field, field_settings in table.get("fields", {}).items():
53+
if "group" in field_settings:
54+
grouped_value = _apply_grouping(field_settings, table_data.get(field))
55+
if grouped_value is not None:
56+
table_data[field] = grouped_value
57+
58+
output.update(table_data)
4659

4760
return output
4861

@@ -104,29 +117,28 @@ def _process_table_lines(
104117
self: "OrderedDict[str, Any]",
105118
table: Dict[str, Any],
106119
table_body: str,
107-
output: Dict[str, Any],
108-
) -> bool:
120+
) -> Optional[Dict[str, Any]]:
109121
"""Process the lines within the table body.
110122
111123
Args:
112124
self (InvoiceTemplate): The current instance of the class. # noqa: DOC103
113125
table (Dict[str, Any]): The validated table settings.
114126
table_body (str): The extracted table body.
115-
output (Dict[str, Any]): A dictionary to store the extracted data.
116127
117128
Returns:
118-
bool: True if processing is successful, False if date parsing fails.
129+
Optional[Dict[str, Any]]: The extracted fields keyed by field name (a field
130+
matched on several lines holds a list of values), or None if a line fails to parse.
119131
"""
120132
types = table.get("types", {})
121133
no_match_found = True
122-
134+
line_output: Dict[str, Any] = {}
123135
for line in re.split(table["line_separator"], table_body):
124136
if not line.strip("").strip("\n") or line.isspace():
125137
continue
126138

127139
# Correct the function call and return logic
128-
if not _process_table_line(self, table, line, types, output):
129-
return False # Return False immediately if date parsing fails
140+
if not _process_table_line(self, table, line, types, line_output):
141+
return None # Return None immediately if line parsing fails
130142
else:
131143
no_match_found = (
132144
False # Update no_match_found only if line processing is successful
@@ -137,10 +149,11 @@ def _process_table_lines(
137149
"\033[1;43mWarning\033[0m regex=\033[91m*%s*\033[0m doesn't match anything!",
138150
table["body"],
139151
)
140-
return True
141152

153+
return line_output
142154

143-
def _process_table_line(
155+
156+
def _process_table_line( # noqa: C901
144157
self: "OrderedDict[str, Any]",
145158
table: Dict[str, Any],
146159
line: str,
@@ -162,9 +175,6 @@ def _process_table_line(
162175
match = re.search(table["body"], line)
163176
if match:
164177
for field, value in match.groupdict().items():
165-
if field in output:
166-
continue
167-
168178
logger.debug(
169179
(
170180
"field=\033[1m\033[93m%s\033[0m |"
@@ -177,18 +187,32 @@ def _process_table_line(
177187
)
178188

179189
if field.startswith("date") or field.endswith("date"):
180-
output[field] = self.parse_date(value) # type: ignore[attr-defined]
181-
if not output[field]:
190+
value = self.parse_date(value) # type: ignore[attr-defined]
191+
if not value:
182192
logger.error("Date parsing failed on date *%s*", value)
183193
return False
184194
elif field.startswith("amount"):
185-
output[field] = self.parse_number(value) # type: ignore[attr-defined]
195+
value = self.parse_number(value) # type: ignore[attr-defined]
186196
elif field in types:
187-
# Access types as a dictionary
188-
output[field] = self.coerce_type(value, types[field]) # type: ignore[attr-defined]
197+
value = self.coerce_type(value, types[field]) # type: ignore[attr-defined]
198+
elif table.get("fields"):
199+
# Writing templates is hard. So we also support the following format
200+
# In case someone mixup syntax
201+
# fields:
202+
# example_field:
203+
# type: float
204+
# group: sum
205+
field_set = table["fields"].get(field, {})
206+
if "type" in field_set:
207+
value = self.coerce_type(value, field_set.get("type")) # type: ignore[attr-defined]
208+
209+
if field in output:
210+
# Ensure output[field] is a list before appending
211+
if not isinstance(output[field], list):
212+
output[field] = [output[field]]
213+
output[field].append(value)
189214
else:
190215
output[field] = value
191-
192216
# Return True if a match is found and processed successfully
193217
return True
194218
else:

tests/custom/table-groups.json

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"issuer": "Table Groups Tests",
4+
"date": "2024-12-20",
5+
"invoice_number": "007/10/2024",
6+
"amount": 123.4,
7+
"currency": "EUR",
8+
"hotel_details": [
9+
"OYO 4189 Resort Nanganallur",
10+
"OYO 4189 Resort Nanganallur Suite A"
11+
],
12+
"date_check_in": "2024-01-08",
13+
"date_check_out": "2024-12-31",
14+
"qty_rooms": 2,
15+
"line_tax_percent": ["1%", "2%", "0%"],
16+
"lamount_tax": ["3.00", "2.00", "0.00"],
17+
"random_num_to_sum": 11.01,
18+
"desc": "Invoice from Table Groups Tests"
19+
}
20+
]

tests/custom/table-groups.txt

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Issue date: 2024-12-20
2+
Issuer: Table Group Tests
3+
Invoice number: 007/10/2024
4+
Total: 123.40 EUR
5+
6+
Table basic
7+
8+
Simple table start
9+
Tax precentage amount qty
10+
1% 3.00 7.00
11+
2% 2.00 4.00
12+
0% 0.00 0.01
13+
Simple table end
14+
15+
16+
17+
Sample data below to test advanced grouping functions of table parser.
18+
19+
Guest Name: Sanjay
20+
21+
Hotel Details Check In Check Out Rooms
22+
OYO 4189 Resort Nanganallur, 01/08/2024 01/01/2018 1
23+
OYO 4189 Resort Nanganallur Suite A, 31/12/2017 31/12/2024 1
24+
25,Vembuliamman Koil Street,, Pazhavanthangal, Chennai
25+
Booking ID Payment Mode
26+
IBZY2087 Cash at Hotel
27+
28+
29+
invoice2data --input-reader=text --debug ./table-groups.txt
30+
31+
invoice2data ./table-groups.txt --debug -t ./templates
+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# SPDX-License-Identifier: MIT
2+
issuer: Table Groups Tests
3+
keywords:
4+
- Table basic
5+
- Simple table start
6+
7+
tables:
8+
- start: Hotel Details\s+Check In\s+Check Out\s+Rooms
9+
end: Booking ID
10+
body: (?P<hotel_details>[\S ]+),\s+(?P<date_check_in>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<date_check_out>\d{1,2}\/\d{1,2}\/\d{4})\s+(?P<qty_rooms>\d)
11+
types:
12+
qty_rooms: int
13+
fields:
14+
qty_rooms:
15+
group: sum
16+
date_check_in:
17+
group: first
18+
date_check_out:
19+
group: last
20+
- start: Tax precentage amount qty
21+
end: Simple table end
22+
body: (?P<line_tax_percent>\d[%])\s+(?P<lamount_tax>\d\.\d{2})\s+(?P<random_num_to_sum>\d\.\d{2})
23+
fields:
24+
random_num_to_sum:
25+
group: sum
26+
# type: float # This is also supported
27+
types:
28+
random_num_to_sum: float # this is supported
29+
30+
fields:
31+
date:
32+
parser: regex
33+
regex: Issue date:\s*(\d{4}-\d{2}-\d{2})
34+
type: date
35+
invoice_number:
36+
parser: regex
37+
regex: Invoice number:\s*([\d/]+)
38+
amount:
39+
parser: regex
40+
regex: Total:\s*(\d+\.\d\d)
41+
type: float
42+
options:
43+
currency: EUR
44+
date_formats:
45+
- "%Y-%m-%d"
46+
decimal_separator: "."

0 commit comments

Comments
 (0)