# Ransomware attack detection using multiple machine learning models
#Dataset link: https://zenodo.org/records/13890887
#text size, and font family setup
import matplotlib.pyplot as plt
import matplotlib
# Set font size and family for the entire figure
matplotlib.rcParams['font.size'] = 12
matplotlib.rcParams['font.family'] = 'serif'
import pandas as pd
# NOTE: google.colab is only available inside a Colab runtime; running this
# script elsewhere requires replacing the Drive mount with a local file path.
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')
# Load dataset (CSV from the Zenodo ransomware dataset; adjust the Drive path
# if the file lives elsewhere in your Drive).
df = pd.read_csv('/content/drive/MyDrive/Machine Learning Lab/Final_Dataset_without_duplicate.csv')

# Preview shape and column names
print("Dataset loaded successfully.")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Show first rows and label distributions.
# FIX: bare expressions such as `df.head()` only render inside a notebook cell;
# as plain Python they are discarded, so wrap them in print() to keep the
# preview visible when the file is executed as a script.
print(df.head())
print(df['Class'].value_counts())
print(df['Category'].value_counts())
print(df['Family'].value_counts())
def _numeric_features_with_target(frame, target, cols_to_drop):
    """Return the numeric feature columns of *frame* joined with one label column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Full dataset including all label columns.
    target : str
        Name of the label column to keep alongside the features.
    cols_to_drop : list[str]
        Columns to remove before selecting numeric (int64/float64) dtypes.

    Returns
    -------
    pandas.DataFrame
        Numeric features plus the single *target* column.
    """
    features = frame.drop(columns=cols_to_drop)
    return features.select_dtypes(include=['int64', 'float64']).join(frame[target])


# Identifier columns carry no predictive signal — dropped for every task.
drop_cols = ['md5', 'sha1', 'file_extension']

# One frame per prediction task, each keeping only its own label column.
# (Previously the same drop/select/join logic was written out three times.)
df1 = _numeric_features_with_target(df, 'Class', drop_cols + ['Category', 'Family'])
df2 = _numeric_features_with_target(df, 'Category', drop_cols + ['Class', 'Family'])
df3 = _numeric_features_with_target(df, 'Family', drop_cols + ['Class', 'Category'])

print("df1, df2, df3 created successfully:")
print("df1 shape (Class):", df1.shape)
print("df2 shape (Category):", df2.shape)
print("df3 shape (Family):", df3.shape)
# Expected output for the Zenodo dataset:
# df1 shape (Class): (21752, 19)
# df2 shape (Category): (21752, 19)
# df3 shape (Family): (21752, 19)
#Binary Class Classification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# 1. Separate features from the binary 'Class' label
X = df1.drop(columns=['Class'])
y = df1['Class']

# 2. Map the string labels to integers (0/1)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 3. Stratified 80/20 split, then standardize (scaler fitted on train only
#    to avoid leaking test-set statistics)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Candidate classifiers, keyed by display name
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# 5. Fit each model, report metrics, and add its ROC curve to one shared figure
plt.figure(figsize=(10, 6))
for model_name, clf in models.items():
    clf.fit(X_train_scaled, y_train)
    predictions = clf.predict(X_test_scaled)
    positive_scores = clf.predict_proba(X_test_scaled)[:, 1]

    print(f"\n {model_name} Metrics:")
    print("Accuracy :", accuracy_score(y_test, predictions))
    print("Precision:", precision_score(y_test, predictions))
    print("Recall :", recall_score(y_test, predictions))
    print("F1 Score :", f1_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

    # ROC curve for the positive class
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc(fpr, tpr):.2f})')

# 6. Finish the ROC figure (dashed diagonal = random-guess baseline)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Class (Benign vs Malware)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
#ROC Curve for Binary Class Classification
#Multiclass Classification (4 Classes)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
# 1. Feature-label split for the 4-class 'Category' task
X = df2.drop(columns=['Category'])
y = df2['Category']

# 2. Encode labels; keep the original class names for plot legends
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_
# FIX: the original also built `y_binarized` from the full label vector here,
# but it was never used (the loop re-binarizes y_test); removed as dead code.

# 3. Stratified train-test split + scaling (fit on train only)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# 5. Train and evaluate each model; plot ROC curves on one shared figure
plt.figure(figsize=(10, 6))
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)

    print(f"\n {name} Metrics:")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
    print("Recall (macro) :", recall_score(y_test, y_pred, average='macro'))
    print("F1 Score (macro) :", f1_score(y_test, y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # ROC AUC (macro-average, one-vs-rest) over the binarized test labels.
    # FIX: this score was computed in the original but never reported.
    y_test_bin = label_binarize(y_test, classes=range(len(class_names)))
    auc_score = roc_auc_score(y_test_bin, y_prob, average="macro", multi_class="ovr")
    print("ROC AUC (macro, OvR):", auc_score)

    # ROC curves (plot only the first 3 classes to avoid clutter)
    for i in range(min(3, len(class_names))):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(fpr, tpr, label=f'{name} - {class_names[i]} (AUC={auc(fpr, tpr):.2f})')

# 6. Finish the ROC figure (dashed diagonal = random-guess baseline)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Category (Multiclass)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
#ROC Curve for 4 class Classification
# Multiclass Classification (14 Classes): ransomware 'Family' prediction.
# FIX: in the original file this whole section was collapsed onto a single
# '#'-prefixed line (a paste/export accident), so it was both unreadable and
# accidentally commented out; reconstructed here as runnable code.
from sklearn.metrics import classification_report

# 1. Feature-label split
X = df3.drop(columns=['Family'])
y = df3['Family']

# 2. Label encode; keep class names for legends / the report
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_

# 3. Stratified train-test split + scaling (fit on train only)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# 5. Train and evaluate each model
plt.figure(figsize=(12, 7))
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)

    print(f"\n {name} Metrics:")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
    print("Recall (macro) :", recall_score(y_test, y_pred, average='macro'))
    print("F1 Score (macro) :", f1_score(y_test, y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    # The report is truncated to 800 characters to keep console output short.
    # FIX: the original header said "(Top 5 Classes)", which was inaccurate —
    # the slice cuts by character count, not by class.
    print("\nClassification Report (truncated):\n", classification_report(
        y_test, y_pred, target_names=class_names, zero_division=0
    )[:800])

    # ROC for the first 3 classes only (14 curves per model would be unreadable)
    y_test_bin = label_binarize(y_test, classes=range(len(class_names)))
    for i in range(min(3, len(class_names))):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(fpr, tpr, label=f'{name} - {class_names[i]} (AUC={auc(fpr, tpr):.2f})')

# 6. Finish the ROC figure (dashed diagonal = random-guess baseline)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Family Classification (Multiclass)')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()
#Roc Curve
# (blog-export footer, kept as comments so the file remains valid Python)
# No comments:
# Post a Comment