Skip to content

Commit 5fbb249

Browse files
authored
Update extrator-de-tabelas.py
Recognize and organize tables and produce a log of the operation.
1 parent 352469e commit 5fbb249

File tree

1 file changed

+74
-16
lines changed

1 file changed

+74
-16
lines changed

extrator-de-tabelas.py

+74-16
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,84 @@
11
import tabula
22
import pandas as pd
33
import os
4+
import logging
45

5-
def pdf_to_csv(pdf_path, output_dir):
6-
# Extract tables from the PDF
7-
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
8-
9-
# Ensure the output directory exists
10-
os.makedirs(output_dir, exist_ok=True)
6+
def setup_logging(log_file_path):
    """Configure the root logger to write to a log file and to the console.

    Both handlers emit records at INFO level and above, formatted as
    ``%(asctime)s - %(levelname)s - %(message)s``. The log file is opened in
    write mode, so every call starts the file from scratch.

    The function may be called repeatedly: handlers installed by an earlier
    call are removed and closed first, so records are not duplicated and the
    previous log file handle is not leaked. Handlers that other code attached
    to the root logger are left untouched.

    Args:
        log_file_path: Path of the log file to create or overwrite.

    Returns:
        The configured root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Remove only what a previous call of this function installed.
    previously_installed = [
        handler
        for handler in logger.handlers
        if getattr(handler, "_installed_by_setup_logging", False)
    ]
    for handler in previously_installed:
        logger.removeHandler(handler)
        handler.close()

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # Explicit UTF-8 keeps non-ASCII paths and messages from failing on
    # platforms whose default encoding cannot represent them.
    file_handler = logging.FileHandler(log_file_path, mode='w', encoding='utf-8')
    console_handler = logging.StreamHandler()

    for handler in (file_handler, console_handler):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        # Tag the handler so a later call can find and replace it.
        handler._installed_by_setup_logging = True
        logger.addHandler(handler)

    return logger
29+
30+
def pdf_to_csv_with_custom_transformation(pdf_path, output_csv_path, log_file_path):
    """Extract tables from a PDF, restack their columns, and save one CSV.

    For every extracted table with at least six columns, columns 4-6 are
    relabelled with the names of columns 1-3 and appended below them, giving
    a three-column table with twice as many rows. Columns beyond the sixth
    are ignored. Tables with fewer than six columns are skipped with a
    warning. All transformed tables are concatenated and written to
    ``output_csv_path`` without the index.

    Progress and failures are reported through the logger returned by
    ``setup_logging``. Exceptions raised during extraction, transformation
    or writing are logged (with traceback) rather than propagated.

    Args:
        pdf_path: Path of the PDF file to read.
        output_csv_path: Path of the combined CSV file to write. Its parent
            directory is created when the path contains one.
        log_file_path: Path of the log file passed to ``setup_logging``.

    Returns:
        True when the CSV was written, False when an error was logged.
    """
    logger = setup_logging(log_file_path)

    logger.info(f"Starting PDF to CSV conversion for: {pdf_path}")

    try:
        # Extract tables from the PDF
        tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

        # Collect the transformed tables and concatenate once at the end
        # instead of growing a DataFrame inside the loop.
        transformed_tables = []

        for table_num, table in enumerate(tables):
            # Check if table has enough columns to process (at least 6 columns)
            if table.shape[1] < 6:
                logger.warning(f"Table {table_num + 1} does not have enough columns to relocate. Skipping table.")
                continue

            part1 = table.iloc[:, :3]  # Columns 1, 2, and 3
            # Copy before relabelling so the assignment never touches a view
            # of the extracted table.
            part2 = table.iloc[:, 3:6].copy()  # Columns 4, 5, and 6
            part2.columns = part1.columns

            # Concatenate part2 below part1
            new_table = pd.concat([part1, part2], ignore_index=True)
            transformed_tables.append(new_table)

            logger.info(f"Processed table {table_num + 1} with shape {new_table.shape} after transformation")

        if transformed_tables:
            combined_df = pd.concat(transformed_tables, ignore_index=True)
        else:
            # pd.concat rejects an empty list; still write an (empty) CSV so
            # the output file is produced as before.
            logger.warning("No table was eligible for transformation; the output CSV will be empty.")
            combined_df = pd.DataFrame()

        # Ensure the output directory exists. A bare file name has an empty
        # dirname, which os.makedirs would reject.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save the combined DataFrame to a single CSV file
        combined_df.to_csv(output_csv_path, index=False)

        logger.info(f"PDF to CSV conversion completed successfully. Combined file saved as: {output_csv_path}")
        return True
    except Exception as e:
        # Top-level boundary of the conversion: record the failure together
        # with its traceback and report it through the return value.
        logger.error(f"Error during PDF to CSV conversion: {str(e)}", exc_info=True)
        return False
74+
75+
# Define the path for the input PDF, output CSV file, and log file
input_pdf_path = "sample_with_tables.pdf"
output_csv_path = "output_csv_files/combined_output.csv"
log_file_path = "conversion_log.txt"

# Perform the conversion
pdf_to_csv_with_custom_transformation(input_pdf_path, output_csv_path, log_file_path)

# The converter catches and logs its own exceptions, so only announce success
# when the output file is actually present. NOTE: a file left over from an
# earlier run also satisfies this check; the log file is the authoritative
# record of what happened.
if os.path.isfile(output_csv_path):
    print(f"PDF tables converted to a single CSV file successfully. Output file: {output_csv_path}")
else:
    print(f"PDF conversion did not produce an output file. See the log for details: {log_file_path}")
print(f"Log file created: {log_file_path}")

0 commit comments

Comments
 (0)