# Ransomware attack detection using multiple machine learning models
# Dataset link: https://zenodo.org/records/13890887
# Text size and font family setup
import matplotlib.pyplot as plt
import matplotlib
# Apply one font size and one font family to every figure drawn afterwards.
matplotlib.rcParams.update({
    'font.size': 12,
    'font.family': 'serif',
})
import pandas as pd
from google.colab import drive
# Mount Google Drive so files stored there are readable under /content/drive.
drive.mount('/content/drive')

# Load the dataset. The CSV must already be in your Drive; change this path
# to wherever you stored the downloaded file.
df = pd.read_csv('/content/drive/MyDrive/Machine Learning Lab/Final_Dataset_without_duplicate.csv')

# Preview shape and column names
print("Dataset loaded successfully.")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Show the first few rows. A bare `df.head()` only renders as the last
# expression of a notebook cell; printing keeps the preview visible when
# this file is run as a plain script.
print(df.head())

# Distribution of each of the three label columns used as targets below.
for label_col in ('Class', 'Category', 'Family'):
    print(df[label_col].value_counts())
# Columns dropped from every feature table before modelling.
drop_cols = ['md5', 'sha1', 'file_extension']
# Only columns of these dtypes are kept as features.
numeric_dtypes = ['int64', 'float64']

# Each frame below = numeric feature columns + exactly one label column.
# The other two label columns are dropped so they cannot be used as features.

# df1: target is 'Class'
df1 = (
    df.drop(columns=[*drop_cols, 'Category', 'Family'])
    .select_dtypes(include=numeric_dtypes)
    .join(df['Class'])
)

# df2: target is 'Category'
df2 = (
    df.drop(columns=[*drop_cols, 'Class', 'Family'])
    .select_dtypes(include=numeric_dtypes)
    .join(df['Category'])
)

# df3: target is 'Family'
df3 = (
    df.drop(columns=[*drop_cols, 'Class', 'Category'])
    .select_dtypes(include=numeric_dtypes)
    .join(df['Family'])
)

print("df1, df2, df3 created successfully:")
print("df1 shape (Class):", df1.shape)
print("df2 shape (Category):", df2.shape)
print("df3 shape (Family):", df3.shape)
#df1, df2, df3 created successfully:
#df1 shape (Class): (21752, 19)
#df2 shape (Category): (21752, 19)
#df3 shape (Family): (21752, 19)
#Binary Class Classification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# Binary experiment: predict 'Class' from the numeric features in df1 with
# three classifiers, print test-set metrics for each, and overlay their ROC
# curves on one figure.

# 1. Feature-label split
X = df1.drop(columns=['Class'])
y = df1['Class']

# 2. Encode labels as integers.
# NOTE(review): precision/recall/F1 and the ROC curve below treat encoded
# label 1 as the positive class -- presumably malware, given the plot title;
# confirm with `le.classes_`.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 3. Stratified 80/20 split, then scale. The scaler is fitted on the
#    training split only and merely applied to the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Define models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# 5. Train, predict, evaluate. Everything down to the ROC `plt.plot` call
#    runs once per model, so it must be inside the loop body.
plt.figure(figsize=(10, 6))
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of encoded label 1.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    print(f"\n {name} Metrics:")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall :", recall_score(y_test, y_pred))
    print("F1 Score :", f1_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # ROC curve for this model, drawn on the shared figure
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# 6. Finish the ROC figure; the dashed diagonal is the chance-level reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Class (Benign vs Malware)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
#ROC Curve for Binary Class Classification

#Multiclass Classification (4 Classes)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
# Multiclass experiment: predict 'Category' from the numeric features in df2
# with three classifiers, print test-set metrics (including macro ROC AUC)
# for each, and overlay per-class ROC curves on one figure.

# 1. Feature-label split
X = df2.drop(columns=['Category'])
y = df2['Category']

# 2. Encode labels; keep the original class names for the plot legend.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_

# 3. Stratified 80/20 split, then scale. The scaler is fitted on the
#    training split only and merely applied to the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Binarized test labels (one column per class) for ROC/AUC. This does not
#    depend on the model, so it is built once rather than on every iteration.
y_test_bin = label_binarize(y_test, classes=range(len(class_names)))

# 5. Models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# 6. Train and evaluate. Everything down to the ROC plotting runs once per
#    model, so it must be inside the loop body.
plt.figure(figsize=(10, 6))
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)

    print(f"\n {name} Metrics:")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
    print("Recall (macro) :", recall_score(y_test, y_pred, average='macro'))
    print("F1 Score (macro) :", f1_score(y_test, y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # ROC AUC (macro-average). Previously computed and then discarded;
    # report it alongside the other metrics.
    auc_score = roc_auc_score(y_test_bin, y_prob, average="macro", multi_class="ovr")
    print("ROC AUC (macro) :", auc_score)

    # ROC curves (plot only the first 3 classes to avoid clutter)
    for i, class_name in enumerate(class_names[:3]):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(fpr, tpr, label=f'{name} - {class_name} (AUC={auc(fpr, tpr):.2f})')

# 7. Finish the ROC figure; the dashed diagonal is the chance-level reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Category (Multiclass)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
#ROC Curve for 4 class Classification

#Multiclass Classification (14 Classes)
from sklearn.metrics import classification_report
# Multiclass experiment: predict 'Family' from the numeric features in df3
# with three classifiers, print test-set metrics and a shortened
# classification report for each, and overlay per-class ROC curves.

# 1. Feature-label split
X = df3.drop(columns=['Family'])
y = df3['Family']

# 2. Label encode; keep the original class names for reports and legends.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_

# 3. Stratified 80/20 split, then scale. The scaler is fitted on the
#    training split only and merely applied to the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Binarized test labels (one column per class) for the ROC curves. This
#    does not depend on the model, so it is built once, outside the loop.
y_test_bin = label_binarize(y_test, classes=range(len(class_names)))

# 5. Models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# The text report starts with a header line and a blank line, followed by one
# row per class; keeping 2 + 5 lines yields the header plus five class rows.
REPORT_CLASS_ROWS = 5
REPORT_HEAD_LINES = 2 + REPORT_CLASS_ROWS

# 6. Train and evaluate. Everything down to the ROC plotting runs once per
#    model, so it must be inside the loop body.
plt.figure(figsize=(12, 7))
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)

    print(f"\n {name} Metrics:")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
    print("Recall (macro) :", recall_score(y_test, y_pred, average='macro'))
    print("F1 Score (macro) :", f1_score(y_test, y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Shortened report: cut on line boundaries so exactly five class rows
    # (the first five in encoded-label order) are shown. The previous
    # 800-character slice could stop mid-row and covered an arbitrary
    # number of classes.
    report = classification_report(
        y_test, y_pred, target_names=class_names, zero_division=0
    )
    report_head = "\n".join(report.splitlines()[:REPORT_HEAD_LINES])
    print("\nClassification Report (Top 5 Classes):\n", report_head)

    # ROC curves for the first 3 classes only, to keep the figure readable
    for i, class_name in enumerate(class_names[:3]):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(fpr, tpr, label=f'{name} - {class_name} (AUC={auc(fpr, tpr):.2f})')

# 7. Finish the ROC figure; the dashed diagonal is the chance-level reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Family Classification (Multiclass)')
plt.legend(loc='best')
plt.grid(True)
plt.tight_layout()
plt.show()
#Roc Curve
# Thank you
# Md. Alamgir Hossain
# MSc in ICT, IICT, BUET