import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs
# Generate synthetic data
# Normal data (majority)
n_samples_normal = 200
X_normal, _ = make_blobs(n_samples=n_samples_normal, centers=[[2, 2]], cluster_std=0.5, random_state=42)
# Anomalous data (minority, further away)
n_samples_anomalies = 20
# Generate anomalies in two distinct groups for visual clarity
anomalies1, _ = make_blobs(n_samples=n_samples_anomalies // 2, centers=[[-2, -2]], cluster_std=0.3, random_state=42)
anomalies2, _ = make_blobs(n_samples=n_samples_anomalies // 2, centers=[[5, -1]], cluster_std=0.3, random_state=12)
X_anomalies = np.vstack([anomalies1, anomalies2])
# Combine normal data and anomalies
X = np.vstack([X_normal, X_anomalies])
# True labels for coloring (0 for normal, 1 for anomaly - for visualization only, not used by IsolationForest)
y_true = np.concatenate([np.zeros(n_samples_normal), np.ones(n_samples_anomalies)])
# Initialize and fit the Isolation Forest model
# contamination: the expected proportion of outliers in the data set.
# Accepts 'auto' or a float in the range (0, 0.5].
# For this example we know the exact proportion, so we pass it directly.
contamination_rate = n_samples_anomalies / (n_samples_normal + n_samples_anomalies)
isolation_forest = IsolationForest(n_estimators=100, contamination=contamination_rate, random_state=42)
isolation_forest.fit(X)
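# Optional sketch: if the true outlier fraction is unknown (the usual case in practice),
# contamination='auto' (the scikit-learn default) sets the threshold as in the original
# Isolation Forest paper. This comparison model is not used in the plots below.
isolation_forest_auto = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
isolation_forest_auto.fit(X)
print(f"Anomalies flagged with contamination='auto': {np.sum(isolation_forest_auto.predict(X) == -1)}")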
# Predict anomalies (-1 for anomalies, 1 for inliers/normal)
y_pred = isolation_forest.predict(X)
# Convert predictions to 0 (normal) and 1 (anomaly) for easier comparison/visualization
y_pred_binary = np.where(y_pred == -1, 1, 0)
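# Optional sketch: quantify agreement with the known labels. This is only possible here
# because the data are synthetic; in a real unsupervised setting y_true is unavailable.
from sklearn.metrics import confusion_matrix  # imported here to keep the sketch self-contained
cm = confusion_matrix(y_true, y_pred_binary)
print("Confusion matrix (rows: true normal/anomaly, cols: predicted normal/anomaly):")
print(cm)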
# Visualize the results
plt.figure(figsize=(10, 7))
# Plot normal points detected as normal
plt.scatter(X[(y_true == 0) & (y_pred_binary == 0), 0], X[(y_true == 0) & (y_pred_binary == 0), 1],
c='blue', s=50, label='Normal (Correctly Classified)', alpha=0.7)
# Plot anomalies detected as anomalies
plt.scatter(X[(y_true == 1) & (y_pred_binary == 1), 0], X[(y_true == 1) & (y_pred_binary == 1), 1],
c='red', s=50, marker='x', label='Anomaly (Correctly Detected)', alpha=0.9)
# Plot normal points misclassified as anomalies (False Positives)
plt.scatter(X[(y_true == 0) & (y_pred_binary == 1), 0], X[(y_true == 0) & (y_pred_binary == 1), 1],
c='orange', s=100, marker='s', label='Normal (Misclassified as Anomaly)', alpha=0.9, edgecolors='black')
# Plot anomalies misclassified as normal (False Negatives)
plt.scatter(X[(y_true == 1) & (y_pred_binary == 0), 0], X[(y_true == 1) & (y_pred_binary == 0), 1],
c='green', s=100, marker='P', label='Anomaly (Misclassified as Normal)', alpha=0.9, edgecolors='black')
plt.title('Anomaly Detection with Isolation Forest')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True)
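# Optional sketch: overlay the learned decision boundary (decision_function == 0) on the
# scatter plot. The grid resolution and padding below are arbitrary choices for illustration.
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
Z = isolation_forest.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contour(xx, yy, Z, levels=[0], colors='black', linestyles='--', linewidths=1)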
# plt.show()
print(f"Number of anomalies detected: {np.sum(y_pred_binary == 1)}")
print(f"Number of true anomalies: {n_samples_anomalies}")
# The decision function gives an anomaly score for each sample (lower = more anomalous)
# anomaly_scores = isolation_forest.decision_function(X)
# print("Anomaly scores (first 10):", anomaly_scores[:10])