Sunday, July 20, 2025

Macro vs weighted metrics in classification for machine learning

Macro vs weighted metrics in classification for machine learning 


Metric Type Meaning
Macro Calculates metric individually per class, then takes the unweighted average. Treats all classes equally, regardless of size.
Weighted Calculates metric per class, then takes the average weighted by class support (number of true samples per class). Large classes have more influence.

Example Dataset:

Class True samples (support) Precision Recall F1-score
Cat 50 0.9 0.8 0.85
Dog 40 0.6 0.7 0.65
Tiger 10 0.5 0.6 0.55






Final Result:

Metric Macro Average Weighted Average
F1-score 0.683 0.74



When to use and What?

Scenario Recommended Metric
Class imbalance exists Weighted
You care equally about all classes Macro
You want to compare per-class fairness Macro
You want an overall accuracy-type proxy Weighted


Coding:

from sklearn.metrics import f1_score

# Example:
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_weighted = f1_score(y_true, y_pred, average='weighted')

Md. Alamgir Hossain

MSc in ICT, IICT, BUET

Ransomware attack detection using multiple machine learning models

 Ransomware attack detection using multiple machine learning models 


#Dataset link: https://zenodo.org/records/13890887


#text size, and font family setup import matplotlib.pyplot as plt import matplotlib # Set font size and family for the entire figure matplotlib.rcParams['font.size'] = 12 matplotlib.rcParams['font.family'] = 'serif' import pandas as pd from google.colab import drive # Mount Google Drive drive.mount('/content/drive') # Load dataset df = pd.read_csv('/content/drive/MyDrive/Machine Learning Lab/Final_Dataset_without_duplicate.csv') #download and paste here the link from google drive # Preview shape and column names print("Dataset loaded successfully.") print("Shape:", df.shape) print("Columns:", df.columns.tolist()) # Show first few rows df.head() df['Class'].value_counts() df['Category'].value_counts() df['Family'].value_counts() # Drop identifier columns drop_cols = ['md5', 'sha1', 'file_extension'] # Create df1 for 'Class' prediction df1 = df.drop(columns=drop_cols + ['Category', 'Family']) df1 = df1.select_dtypes(include=['int64', 'float64']).join(df['Class']) # Create df2 for 'Category' prediction df2 = df.drop(columns=drop_cols + ['Class', 'Family']) df2 = df2.select_dtypes(include=['int64', 'float64']).join(df['Category']) # Create df3 for 'Family' prediction df3 = df.drop(columns=drop_cols + ['Class', 'Category']) df3 = df3.select_dtypes(include=['int64', 'float64']).join(df['Family']) print("df1, df2, df3 created successfully:") print("df1 shape (Class):", df1.shape) print("df2 shape (Category):", df2.shape) print("df3 shape (Family):", df3.shape) #df1, df2, df3 created successfully: #df1 shape (Class): (21752, 19) #df2 shape (Category): (21752, 19) #df3 shape (Family): (21752, 19) #Binary Class Classification import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier # 1. Feature-label split X = df1.drop(columns=['Class']) y = df1['Class'] # 2. Encode labels le = LabelEncoder() y_encoded = le.fit_transform(y) # 3. Split and scale X_train, X_test, y_train, y_test = train_test_split( X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded ) scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # 4. Define models models = { 'Decision Tree': DecisionTreeClassifier(random_state=42), 'KNN': KNeighborsClassifier(n_neighbors=5), 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42) } # 5. Train, Predict, Evaluate plt.figure(figsize=(10, 6)) for name, model in models.items(): model.fit(X_train_scaled, y_train) y_pred = model.predict(X_test_scaled) y_prob = model.predict_proba(X_test_scaled)[:, 1] print(f"\n {name} Metrics:") print("Accuracy :", accuracy_score(y_test, y_pred)) print("Precision:", precision_score(y_test, y_pred)) print("Recall :", recall_score(y_test, y_pred)) print("F1 Score :", f1_score(y_test, y_pred)) print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)) # ROC curve fpr, tpr, _ = roc_curve(y_test, y_prob) roc_auc = auc(fpr, tpr) plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})') # 6. Plot ROC Curve plt.plot([0, 1], [0, 1], 'k--') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve - Class (Benign vs Malware)') plt.legend() plt.grid(True) plt.tight_layout() plt.show()

#ROC Curve for Binary Class Classification



#Multiclass Classification (4 Classes) from sklearn.metrics import roc_auc_score from sklearn.preprocessing import label_binarize # 1. Feature-label split X = df2.drop(columns=['Category']) y = df2['Category'] # 2. Encode labels le = LabelEncoder() y_encoded = le.fit_transform(y) class_names = le.classes_ # 3. One-hot encode for ROC-AUC (multiclass) y_binarized = label_binarize(y_encoded, classes=range(len(class_names))) # 4. Train-test split + scale X_train, X_test, y_train, y_test = train_test_split( X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded ) scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # 5. Models models = { 'Decision Tree': DecisionTreeClassifier(random_state=42), 'KNN': KNeighborsClassifier(n_neighbors=5), 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42) } # 6. Train and Evaluate plt.figure(figsize=(10, 6)) for name, model in models.items(): model.fit(X_train_scaled, y_train) y_pred = model.predict(X_test_scaled) y_prob = model.predict_proba(X_test_scaled) print(f"\n {name} Metrics:") print("Accuracy :", accuracy_score(y_test, y_pred)) print("Precision (macro):", precision_score(y_test, y_pred, average='macro')) print("Recall (macro) :", recall_score(y_test, y_pred, average='macro')) print("F1 Score (macro) :", f1_score(y_test, y_pred, average='macro')) print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)) # ROC AUC (macro-average) y_test_bin = label_binarize(y_test, classes=range(len(class_names))) auc_score = roc_auc_score(y_test_bin, y_prob, average="macro", multi_class="ovr") # ROC curve (plot only first 3 classes to avoid clutter) for i in range(min(3, len(class_names))): fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i]) plt.plot(fpr, tpr, label=f'{name} - {class_names[i]} (AUC={auc(fpr, tpr):.2f})') # 7. Plot ROC Curve plt.plot([0, 1], [0, 1], 'k--') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve - Category (Multiclass)') plt.legend() plt.grid(True) plt.tight_layout() plt.show()


#ROC Curve for 4 class Classification



#Multiclass Classification (14 Classes) from sklearn.metrics import classification_report # 1. Feature-label split X = df3.drop(columns=['Family']) y = df3['Family'] # 2. Label encode le = LabelEncoder() y_encoded = le.fit_transform(y) class_names = le.classes_ # 3. Binarize labels for ROC y_binarized = label_binarize(y_encoded, classes=range(len(class_names))) # 4. Train-test split + scaling X_train, X_test, y_train, y_test = train_test_split( X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded ) scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # 5. Models models = { 'Decision Tree': DecisionTreeClassifier(random_state=42), 'KNN': KNeighborsClassifier(n_neighbors=5), 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42) } # 6. Train and evaluate plt.figure(figsize=(12, 7)) for name, model in models.items(): model.fit(X_train_scaled, y_train) y_pred = model.predict(X_test_scaled) y_prob = model.predict_proba(X_test_scaled) print(f"\n {name} Metrics:") print("Accuracy :", accuracy_score(y_test, y_pred)) print("Precision (macro):", precision_score(y_test, y_pred, average='macro')) print("Recall (macro) :", recall_score(y_test, y_pred, average='macro')) print("F1 Score (macro) :", f1_score(y_test, y_pred, average='macro')) print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)) print("\nClassification Report (Top 5 Classes):\n", classification_report( y_test, y_pred, target_names=class_names, zero_division=0 )[:800]) # Print shortened version # ROC for first 3 classes y_test_bin = label_binarize(y_test, classes=range(len(class_names))) for i in range(min(3, len(class_names))): fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i]) plt.plot(fpr, tpr, label=f'{name} - {class_names[i]} (AUC={auc(fpr, tpr):.2f})') # 7. Plot ROC plt.plot([0, 1], [0, 1], 'k--') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve - Family Classification (Multiclass)') plt.legend(loc='best') plt.grid(True) plt.tight_layout() plt.show()

#Roc Curve



#Thank You

Md. Alamgir Hossain
MSc in ICT, IICT, BUET









Monday, July 14, 2025

FED-GEM-CN: A federated dual-CNN architecture with contrastive cross-attention for maritime radar intrusion detection

 🚀 Latest research paper!

🧠Title: FED-GEM-CN: A federated dual-CNN architecture with contrastive cross-attention for maritime radar intrusion detection
📖 Journal: Array (Elsevier)
📊 Impact Factor: 4.5 | CiteScore: 10.3 | Quartile: Q1
🔎 Indexed in: SCOPUS, ESCI, and more
🌐 DOI: 10.1016/j.array.2025.100456



#Research_Paper


Md. Alamgir Hossain

Instructor, Skill Morph

MSc. in ICT, IICT, BUET || BSc in CSE, JUST