|
1 | 1 | import tabula
|
2 | 2 | import pandas as pd
|
3 | 3 | import os
|
| 4 | +import logging |
4 | 5 |
|
5 |
| -def pdf_to_csv(pdf_path, output_dir): |
6 |
| - # Extract tables from the PDF |
7 |
| - tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True) |
8 |
| - |
9 |
| - # Ensure the output directory exists |
10 |
| - os.makedirs(output_dir, exist_ok=True) |
def setup_logging(log_file_path):
    """Configure the root logger to log INFO+ records to a file and the console.

    The function is safe to call more than once: handlers installed by a
    previous call are removed and closed before the new ones are attached,
    so records are never duplicated and the old log file handle is released.
    Handlers attached to the root logger by other code are left untouched.

    Args:
        log_file_path: Path of the log file. It is opened with mode ``'w'``,
            so any existing content is truncated.

    Returns:
        The root ``logging.Logger`` with its level set to ``INFO``.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Without this cleanup every call would stack another file/console pair
    # on the root logger, repeating each message once per earlier call.
    for stale in list(logger.handlers):
        if getattr(stale, "_installed_by_setup_logging", False):
            logger.removeHandler(stale)
            stale.close()

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    new_handlers = (
        logging.FileHandler(log_file_path, mode='w'),
        logging.StreamHandler(),
    )
    for handler in new_handlers:
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        # Marker used by the cleanup loop above on the next call.
        handler._installed_by_setup_logging = True
        logger.addHandler(handler)

    return logger
| 29 | + |
def _stack_column_halves(table):
    """Return columns 4-6 of *table* stacked beneath columns 1-3."""
    left = table.iloc[:, :3]
    # Copy before renaming so the caller's table is never touched.
    right = table.iloc[:, 3:6].copy()
    right.columns = left.columns
    return pd.concat([left, right], ignore_index=True)


def pdf_to_csv_with_custom_transformation(pdf_path, output_csv_path, log_file_path):
    """Extract every table from a PDF, reshape each one, and save a single CSV.

    For each extracted table with at least six columns, columns 4-6 are
    renamed to match columns 1-3 and appended below them; any columns past
    the sixth are discarded. Tables with fewer than six columns are skipped
    with a warning. All reshaped tables are concatenated into one CSV.

    Errors are logged (with traceback) rather than raised.

    Args:
        pdf_path: Path of the PDF passed to ``tabula.read_pdf``.
        output_csv_path: Destination CSV path. Its parent directory is created
            when the path has one.
        log_file_path: Path of the log file handed to ``setup_logging``.

    Returns:
        ``True`` if the CSV was written, ``False`` if an error was logged.
    """
    logger = setup_logging(log_file_path)

    logger.info(f"Starting PDF to CSV conversion for: {pdf_path}")

    try:
        # Extract tables from the PDF
        tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

        # Collect the reshaped tables and concatenate once at the end;
        # concatenating inside the loop recopies all earlier rows each time.
        reshaped_tables = []
        for table_num, table in enumerate(tables, start=1):
            if table.shape[1] < 6:
                logger.warning(f"Table {table_num} does not have enough columns to relocate. Skipping table.")
                continue

            new_table = _stack_column_halves(table)
            reshaped_tables.append(new_table)
            logger.info(f"Processed table {table_num} with shape {new_table.shape} after transformation")

        if reshaped_tables:
            combined_df = pd.concat(reshaped_tables, ignore_index=True)
        else:
            # pd.concat rejects an empty list; keep writing an empty CSV instead.
            logger.warning("No tables with at least 6 columns were found; writing an empty CSV.")
            combined_df = pd.DataFrame()

        # os.makedirs('') raises FileNotFoundError, so only create the parent
        # directory when the output path actually has one.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save the combined DataFrame to a single CSV file
        combined_df.to_csv(output_csv_path, index=False)

        logger.info(f"PDF to CSV conversion completed successfully. Combined file saved as: {output_csv_path}")
    except Exception as e:
        # Top-level boundary: keep the best-effort contract (no raise) but
        # record the traceback so failures can be diagnosed from the log.
        logger.exception(f"Error during PDF to CSV conversion: {str(e)}")
        return False
    return True
| 74 | + |
# Define the path for the input PDF, output CSV file, and log file
input_pdf_path = "sample_with_tables.pdf"
output_csv_path = "output_csv_files/combined_output.csv"
log_file_path = "conversion_log.txt"


def _mtime_ns(path):
    """Return the modification time of *path* in nanoseconds, or None if it cannot be stat'ed."""
    try:
        return os.stat(path).st_mtime_ns
    except OSError:
        return None


# Remember the state of any output left by an earlier run, so a stale file
# is not mistaken for a fresh result.
_previous_output_mtime = _mtime_ns(output_csv_path)

# Perform the conversion
pdf_to_csv_with_custom_transformation(input_pdf_path, output_csv_path, log_file_path)

# The conversion logs its errors instead of raising, so only report success
# when the output file was actually (re)written by this run.
# NOTE(review): relies on the filesystem updating mtime between runs; on a
# coarse-timestamp filesystem a very fast re-run could be misreported.
_current_output_mtime = _mtime_ns(output_csv_path)
if _current_output_mtime is not None and _current_output_mtime != _previous_output_mtime:
    print(f"PDF tables converted to a single CSV file successfully. Output file: {output_csv_path}")
else:
    print(f"PDF to CSV conversion failed. See the log file for details: {log_file_path}")
print(f"Log file created: {log_file_path}")
0 commit comments