Commit a4583ce
Initial scripts, directory structure and implementation
0 parents  commit a4583ce

12 files changed: +329 −0 lines changed

.gitignore

+2 lines

nids/__pycache__
dataset/MachineLearningCVE

csvs.py

+10 lines

import glob
import pandas as pd

csv_files = glob.glob('dataset/MachineLearningCVE/*.csv')
print(f"Found CSV files: {csv_files}")

# Load a single CSV file to inspect its columns
df = pd.read_csv('dataset/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
print(df.columns)

nids/__init__.py

+7 lines

# nids/__init__.py

from .data_preprocessing import load_and_preprocess_data
from .model import Net, train_model
from .logging import setup_logging, log_prediction
from .prediction import run_prediction
from .retraining import retrain

nids/data_preprocessing.py

+64 lines

# nids/data_preprocessing.py

import pandas as pd
import glob
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

def load_and_preprocess_data(csv_files_path):
    # Load all CSV files
    csv_files = glob.glob(csv_files_path)
    print(f"Found CSV files: {csv_files}")  # Debug print

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in the path: {csv_files_path}")

    dataframes = [pd.read_csv(file) for file in csv_files]

    # Concatenate all dataframes
    data = pd.concat(dataframes, ignore_index=True)
    print(f"Concatenated Data Shape: {data.shape}")  # Debug print

    # Handle missing values
    data = data.dropna()

    # Check for the target label column (some CSVs use ' Label' with a leading space)
    possible_label_columns = ['label', 'Label', 'class', 'Class', ' Label']
    label_column = None
    for col in possible_label_columns:
        if col in data.columns:
            label_column = col
            break

    if label_column is None:
        raise ValueError("The target label column was not found in the dataset.")

    # Encode the categorical target as integer class codes
    data[label_column] = data[label_column].astype('category').cat.codes

    # Print unique values of the target labels
    print(f"Unique target labels: {data[label_column].unique()}")

    # Replace infinite values with NaN
    data.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Convert all columns to numeric, coercing non-numeric values to NaN
    data = data.apply(pd.to_numeric, errors='coerce')

    # Drop rows with NaN values
    data.dropna(inplace=True)

    # Normalize numerical features, excluding the label column so the
    # class codes stay intact for CrossEntropyLoss
    numerical_features = data.select_dtypes(include=['float64', 'int64']).columns.drop(label_column, errors='ignore')
    scaler = StandardScaler()
    data[numerical_features] = scaler.fit_transform(data[numerical_features])

    # Split features and labels
    X = data.drop(columns=[label_column])
    y = data[label_column]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, scaler

nids/logging.py

+11 lines

# nids/logging.py

import logging

def setup_logging():
    # Configure logging to a file
    logging.basicConfig(filename='nids_logs.log', level=logging.INFO,
                        format='%(asctime)s:%(levelname)s:%(message)s')

def log_prediction(data, prediction):
    logging.info(f'Data: {data}, Prediction: {prediction.item()}')
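With this configuration, each prediction is appended to nids_logs.log roughly as follows (timestamp, payload, and class index are illustrative, not taken from this commit):

2024-07-01 12:00:00,000:INFO:Data: {'Flow Duration': 1000, ...}, Prediction: 3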

nids/model.py

+52 lines

# nids/model.py

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

class Net(nn.Module):
    def __init__(self, input_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train_model(X_train, y_train):
    # Convert data to numpy arrays before creating tensors
    X_train_array = X_train.to_numpy()
    y_train_array = y_train.to_numpy()

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_array, dtype=torch.long)

    # Create DataLoader
    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Get the number of unique classes
    num_classes = len(pd.unique(y_train))

    # Initialize the model, loss function, and optimizer
    model = Net(X_train.shape[1], num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    num_epochs = 20
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

    return model, num_classes
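As a quick sanity check (not part of this commit), Net can be exercised on a random batch to confirm that the output shape matches the number of classes. The sizes 78 and 15 below are illustrative assumptions, not properties of the dataset:

# shape_check.py — illustrative sketch only
import torch
from nids.model import Net

input_size, num_classes = 78, 15          # hypothetical sizes
model = Net(input_size, num_classes)

dummy_batch = torch.randn(32, input_size)  # 32 fake flow records
logits = model(dummy_batch)
print(logits.shape)                        # expected: torch.Size([32, 15])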

nids/prediction.py

+34 lines

# nids/prediction.py

from kafka import KafkaConsumer
import torch
import pandas as pd
import json
from nids.logging import log_prediction

def preprocess_data(data, scaler):
    # Wrap the single message in a DataFrame, one-hot encode any categorical
    # fields, and apply the scaler fitted during training
    data = pd.DataFrame([data])
    data = pd.get_dummies(data)
    data = scaler.transform(data)
    return torch.tensor(data, dtype=torch.float32)

def run_prediction(model, scaler):
    # Initialize Kafka consumer
    consumer = KafkaConsumer('network_traffic',
                             bootstrap_servers='localhost:9092',
                             value_deserializer=lambda v: json.loads(v.decode('utf-8')))

    model.eval()
    # Real-time prediction loop
    for message in consumer:
        data = message.value
        data_tensor = preprocess_data(data, scaler)

        # Make prediction
        with torch.no_grad():
            output = model(data_tensor)
        _, prediction = torch.max(output, 1)
        log_prediction(data, prediction)
        print(f'Prediction: {prediction.item()}')
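To exercise the consumer end to end, a minimal producer sketch is shown below. It assumes a local broker on localhost:9092 and a hypothetical feature dictionary; in practice the keys would have to match the columns the scaler was fitted on, and the field names here are placeholders, not taken from this commit:

# producer_sketch.py — illustrative only, not part of this commit
import json
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         value_serializer=lambda v: json.dumps(v).encode('utf-8'))

# Placeholder record: keys must match the training feature columns in practice
sample_flow = {'Flow Duration': 1000, 'Total Fwd Packets': 10}
producer.send('network_traffic', sample_flow)
producer.flush()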

nids/retraining.py

+65 lines

# nids/retraining.py

import glob
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from nids.model import Net

def load_and_preprocess_data(csv_files_path):
    # Expand the glob pattern and concatenate all matching CSV files
    csv_files = glob.glob(csv_files_path)
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in the path: {csv_files_path}")
    data = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

    data = data.drop(columns=['timestamp'], errors='ignore')
    X = data.drop(columns=['label'])
    # Encode labels as integer class codes, consistent with the main preprocessing
    y = data['label'].astype('category').cat.codes
    X = pd.get_dummies(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return train_test_split(X, y, test_size=0.2, random_state=42), scaler

def retrain_model(model, train_loader):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(10):  # Fewer epochs for incremental training
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    return model

def retrain(csv_files_path):
    (X_train, X_test, y_train, y_test), scaler = load_and_preprocess_data(csv_files_path)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Load the existing model
    with open('nids/model_metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    num_features = metadata['num_features']
    num_classes = metadata['num_classes']

    model = Net(num_features, num_classes)
    model.load_state_dict(torch.load('nids/model.pth'))

    # Retrain the model
    model = retrain_model(model, train_loader)

    # Save the updated model
    torch.save(model.state_dict(), 'nids/updated_model.pth')

    # Save the scaler and metadata
    with open('nids/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    with open('nids/model_metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)

    print("Model, scaler, and metadata (number of features and classes) updated successfully.")

requirements.txt

+4 lines

pandas
scikit-learn
torch
kafka-python

retrain_and_run.py

+28 lines

# retrain_and_run.py

import torch
import pickle
from nids import Net, setup_logging, run_prediction, retrain

if __name__ == '__main__':
    # Setup logging
    setup_logging()

    # Retrain the model
    retrain('nids/new_data_directory/*.csv')

    # Load the number of features and classes
    with open('nids/model_metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    num_features = metadata['num_features']
    num_classes = metadata['num_classes']

    # Load the updated model and scaler
    model = Net(input_size=num_features, num_classes=num_classes)
    model.load_state_dict(torch.load('nids/updated_model.pth'))

    with open('nids/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Run real-time prediction
    run_prediction(model, scaler)

run.py

+25 lines

# run.py

import torch
import pickle
from nids import Net, setup_logging, run_prediction

if __name__ == '__main__':
    # Setup logging
    setup_logging()

    # Load the number of features and classes
    with open('nids/model_metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    num_features = metadata['num_features']
    num_classes = metadata['num_classes']

    # Load the model and scaler
    model = Net(input_size=num_features, num_classes=num_classes)
    model.load_state_dict(torch.load('nids/model.pth'))

    with open('nids/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    # Run real-time prediction
    run_prediction(model, scaler)

train.py

+27 lines

# train.py

import torch
import pickle
from nids import load_and_preprocess_data, train_model

if __name__ == '__main__':
    # Load and preprocess data
    X_train, X_test, y_train, y_test, scaler = load_and_preprocess_data('dataset/MachineLearningCVE/*.csv')

    # Train the model
    model, num_classes = train_model(X_train, y_train)

    # Save the model and scaler
    torch.save(model.state_dict(), 'nids/model.pth')
    with open('nids/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    # Save the number of features and classes
    with open('nids/model_metadata.pkl', 'wb') as f:
        metadata = {
            'num_features': X_train.shape[1],
            'num_classes': num_classes
        }
        pickle.dump(metadata, f)

    print("Model, scaler, and metadata (number of features and classes) saved successfully.")
