DDoS Attack Detection with the Decision Tree Algorithm


# Complete Code with a Decision Tree

#Load and Explore Dataset
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')
#Output: Mounted at /content/drive

#Dataset Connecting
# Load the dataset
ds_path = '/content/drive/MyDrive/Machine Learning Lab/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
df = pd.read_csv(ds_path)

# Show basic info
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
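Before cleaning, it can help to check how balanced the two classes (BENIGN vs. DDoS) are, since that influences which metrics matter later. A minimal sketch, assuming the label column is named 'Label' (some exports of this CSV add a leading space to column names):

# Optional: quick look at class balance (assumes a 'Label' column; falls back
# to the last column otherwise).
label_col = 'Label' if 'Label' in df.columns else df.columns[-1]
print(df[label_col].value_counts())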


# Check for missing or infinite values
print("\nMissing values per column:")
print(df.isnull().sum())
print("\nColumns with infinite values:")
print((df == float('inf')).sum())

#Data Cleaning, Encoding, and Splitting
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Step 2.1: Drop rows with NaN or infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
print("After dropping NaNs and Infs, shape:", df.shape)

# Step 2.2: Drop non-feature identifier columns if present
drop_cols = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Label']
available_drop_cols = [col for col in drop_cols if col in df.columns]
X = df.drop(columns=available_drop_cols, errors='ignore')
y = df['Label'] if 'Label' in df.columns else df.iloc[:, -1]  # fall back to the last column if 'Label' is not present

# Step 2.3: Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("Label classes:", le.classes_)

# Step 2.4: Encode categorical features in X
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = LabelEncoder().fit_transform(X[col])

# Step 2.5: Train-validation-test split (60% train, 20% val, 20% test)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)  # 0.25 of 0.8 = 0.2

# Final check
print("\nFinal dataset sizes:")
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")
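A quick sanity check after splitting is to confirm that stratification preserved the class ratio in all three subsets. This is a small sketch built on the variables created above (y_train, y_val, y_test, le), not part of the original pipeline:

# Sanity check: stratified splits should keep roughly the same class ratio.
import numpy as np
for name, labels in [("train", y_train), ("val", y_val), ("test", y_test)]:
    values, counts = np.unique(labels, return_counts=True)
    print(name, dict(zip(le.inverse_transform(values), counts)))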



#Train the Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

# Initialize and train Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on validation set
y_pred_val = dt_model.predict(X_val)
y_prob_val = dt_model.predict_proba(X_val)[:, 1]  # for ROC/AUC
print("Model training complete. Predictions on validation set ready.")
#Output: Model training complete. Predictions on validation set ready.

#Evaluation with Full Metrics
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, balanced_accuracy_score, matthews_corrcoef
)

# Compute confusion matrix
cm = confusion_matrix(y_val, y_pred_val)
TN, FP, FN, TP = cm.ravel()

# Metrics
accuracy = accuracy_score(y_val, y_pred_val)
precision = precision_score(y_val, y_pred_val)
recall = recall_score(y_val, y_pred_val)
f1 = f1_score(y_val, y_pred_val)
roc_auc = roc_auc_score(y_val, y_prob_val)
specificity = TN / (TN + FP)
fpr = FP / (FP + TN)
fnr = FN / (FN + TP)
balanced_acc = balanced_accuracy_score(y_val, y_pred_val)
mcc = matthews_corrcoef(y_val, y_pred_val)

# Print results
print(f"\nAccuracy: {accuracy:.5f}")
print(f"Precision (PPV): {precision:.5f}")
print(f"Recall (Sensitivity): {recall:.5f}")
print(f"F1 Score: {f1:.5f}")
print(f"ROC AUC: {roc_auc:.5f}")
print(f"Specificity (TNR): {specificity:.5f}")
print(f"False Positive Rate: {fpr:.5f}")
print(f"False Negative Rate: {fnr:.5f}")
print(f"Balanced Accuracy: {balanced_acc:.5f}")
print(f"Matthews Corr Coef: {mcc:.5f}")
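A fully grown tree can memorize the training data, so it is worth checking whether a shallower tree performs just as well on the validation set. This is only a comparison sketch and does not replace the dt_model trained above:

# Optional: compare validation F1 for a few depth limits vs. the unpruned tree.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

for depth in [3, 5, 10, None]:
    tuned = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tuned.fit(X_train, y_train)
    print(f"max_depth={depth}: validation F1 = {f1_score(y_val, tuned.predict(X_val)):.5f}")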



#Confusion Matrix
import matplotlib.pyplot as plt  # needed here; originally only imported in the next block
import seaborn as sns
import numpy as np

# Title
title = "Confusion Matrix - Decision Tree (Validation Set)"

# Set style
plt.rcParams.update({
    'font.size': 18,
    'font.family': 'serif',
    'axes.titlesize': 18,
    'axes.labelsize': 18,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18
})

# Plot
fig, ax = plt.subplots(figsize=(8, 5))
cmap = sns.color_palette("crest", as_cmap=True)
sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, cbar=True, ax=ax,
            annot_kws={"fontsize": 18}, linewidths=0.5, linecolor='white')

ax.set_title(title)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_xticklabels(le.classes_, rotation=45, fontsize=14)
ax.set_yticklabels(le.classes_, rotation=0, fontsize=14)

# Inner gridlines
ax.hlines([1], *ax.get_xlim(), colors='white', linewidth=4)
ax.vlines([1], *ax.get_ylim(), colors='white', linewidth=4)

plt.tight_layout()
plt.show()
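Since decision trees expose feature_importances_, a short follow-up is to list which flow features actually drive the splits. A minimal sketch using the fitted dt_model and the feature frame X from the preprocessing step:

# Optional: top-10 features by importance in the fitted tree.
import pandas as pd
importances = pd.Series(dt_model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10).round(5))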



#ROC Curve & Classification Report
from sklearn.metrics import classification_report, roc_curve
import matplotlib.pyplot as plt

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_val, y_prob_val)

plt.figure(figsize=(7, 3))
plt.plot(fpr, tpr, label=f'DT ROC (AUC = {roc_auc:.5f})', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title("ROC Curve - Decision Tree")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Formatted classification report (5 digits)
report_dict = classification_report(y_val, y_pred_val, target_names=le.classes_, output_dict=True)

print("\nClassification Report (rounded to 5 digits):\n")
print(f"{'Label':<15} {'Precision':>10} {'Recall':>10} {'F1-Score':>10} {'Support':>10}")
print("-" * 60)
for label, scores in report_dict.items():
    if isinstance(scores, dict):
        precision = f"{scores['precision']:.5f}"
        recall = f"{scores['recall']:.5f}"
        f1 = f"{scores['f1-score']:.5f}"
        support = f"{int(scores['support'])}"
        print(f"{label:<15} {precision:>10} {recall:>10} {f1:>10} {support:>10}")
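All of the metrics so far are computed on the validation set; the 20% test split created earlier is never touched. As a hedged sketch, a final one-time check on that held-out test set looks like this:

# Optional: final evaluation on the untouched test split.
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

y_pred_test = dt_model.predict(X_test)
y_prob_test = dt_model.predict_proba(X_test)[:, 1]
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test):.5f}")
print(f"Test F1 Score: {f1_score(y_test, y_pred_test):.5f}")
print(f"Test ROC AUC:  {roc_auc_score(y_test, y_prob_test):.5f}")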




#5-Fold Cross-Validation
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Stratified 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Predict probabilities using cross-validation
y_cv_prob = cross_val_predict(dt_model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]

# Calculate ROC AUC
cv_auc = roc_auc_score(y_train, y_cv_prob)
print(f"Cross-Validation ROC AUC (5-fold): {cv_auc:.5f}")
#Output: Cross-Validation ROC AUC (5-fold): 1.00000
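A cross-validated AUC of exactly 1.00000 deserves a second look: it can be genuine on this dataset, but it can also hint at a near-duplicate or identifier-like feature making the task trivially separable. One quick check, as a sketch, is to look at the per-fold scores rather than a single aggregate:

# Optional: per-fold ROC AUC as a sanity check on the perfect aggregate score.
from sklearn.model_selection import cross_val_score

fold_scores = cross_val_score(dt_model, X_train, y_train, cv=cv, scoring='roc_auc')
print("Per-fold ROC AUC:", [f"{s:.5f}" for s in fold_scores])
print(f"Mean: {fold_scores.mean():.5f}, Std: {fold_scores.std():.5f}")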


#Thank you and Good Luck!!!
