5
5
from sklearn .preprocessing import StandardScaler
6
6
from sklearn .model_selection import train_test_split
7
7
import numpy as np
8
+ import pickle
8
9
9
10
def load_and_preprocess_data (csv_files_path ):
10
11
# Load all CSV files
@@ -35,10 +36,13 @@ def load_and_preprocess_data(csv_files_path):
35
36
raise ValueError ("The target label column is not found in the dataset." )
36
37
37
38
# Encode categorical variables
38
- data [label_column ] = data [label_column ].astype ('category' ).cat .codes
39
+ data [label_column ] = data [label_column ].astype ('category' )
40
+ class_mapping = dict (enumerate (data [label_column ].cat .categories ))
41
+ data [label_column ] = data [label_column ].cat .codes
39
42
40
43
# Print unique values of the target labels
41
44
print (f"Unique target labels: { data [label_column ].unique ()} " )
45
+ print (f"Class mapping: { class_mapping } " )
42
46
43
47
# Replace infinite values with NaN
44
48
data .replace ([np .inf , - np .inf ], np .nan , inplace = True )
@@ -61,4 +65,16 @@ def load_and_preprocess_data(csv_files_path):
61
65
# Split data into training and testing sets
62
66
X_train , X_test , y_train , y_test = train_test_split (X , y , test_size = 0.2 , random_state = 42 )
63
67
64
- return X_train , X_test , y_train , y_test , scaler
68
+ # Save the class mapping, number of features, and feature names
69
+ metadata = {
70
+ 'num_features' : X_train .shape [1 ],
71
+ 'num_classes' : len (class_mapping ),
72
+ 'class_mapping' : class_mapping ,
73
+ 'feature_names' : list (X .columns )
74
+ }
75
+ with open ('nids/model_metadata.pkl' , 'wb' ) as f :
76
+ pickle .dump (metadata , f )
77
+
78
+ print ("Metadata (number of features, classes, class mapping and feature names) saved. " )
79
+
80
+ return X_train , X_test , y_train , y_test , scaler
0 commit comments