import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
def random_forest_classification_example():
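    """Train a RandomForestClassifier on synthetic data and report metrics."""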
# Generate synthetic classification dataset
X, y = make_classification(
n_samples=1000,
n_features=20,
n_informative=15,
n_redundant=5,
n_classes=3,
random_state=42
)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the random forest classifier
rf_clf = RandomForestClassifier(
n_estimators=100, # Number of trees
        max_depth=None,       # No depth limit: trees grow until leaves are pure
min_samples_split=2, # Minimum samples to split a node
min_samples_leaf=1, # Minimum samples at leaf nodes
max_features='sqrt', # Number of features to consider for best split
bootstrap=True, # Use bootstrap samples
oob_score=True, # Calculate out-of-bag score
n_jobs=-1, # Use all available cores
random_state=42
)
rf_clf.fit(X_train, y_train)
# Make predictions
y_pred = rf_clf.predict(X_test)
# Print performance metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Out-of-Bag Score: {rf_clf.oob_score_:.4f}")
# Feature importance analysis
feature_importance = pd.DataFrame({
'feature': [f'Feature {i}' for i in range(X.shape[1])],
'importance': rf_clf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
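    # Impurity-based importances can favor high-cardinality features, so
    # permutation importance on the test split is a useful cross-check
    # (a minimal sketch; n_repeats=10 is an arbitrary choice).
    from sklearn.inspection import permutation_importance
    perm = permutation_importance(rf_clf, X_test, y_test, n_repeats=10, random_state=42)
    print(f"Largest mean permutation importance: {perm.importances_mean.max():.4f}")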
# Plot feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Most Important Features')
plt.xlabel('Feature Importance')
# plt.show()
# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
# plt.show()
def random_forest_regression_example():
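    """Train a RandomForestRegressor on a synthetic nonlinear target and report metrics."""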
# Generate synthetic regression dataset
np.random.seed(42)
X = np.random.rand(1000, 10)
y = 3*X[:, 0] + 2*X[:, 1]**2 - 4*X[:, 2]*X[:, 3] + np.random.normal(0, 0.1, 1000)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train random forest regressor
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(
n_estimators=100,
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
        max_features=1.0,     # Consider all features per split ('auto' was removed in scikit-learn 1.3)
bootstrap=True,
n_jobs=-1,
random_state=42
)
rf_reg.fit(X_train, y_train)
# Make predictions
y_pred = rf_reg.predict(X_test)
# Calculate performance metrics
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"
Regression Metrics:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")
# Feature importance analysis
feature_importance = pd.DataFrame({
'feature': [f'Feature {i}' for i in range(X.shape[1])],
'importance': rf_reg.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance in Random Forest Regression')
plt.xlabel('Feature Importance')
# plt.show()
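    # A predicted-vs-actual scatter is a common visual complement to the
    # metrics above; the dashed line marks perfect predictions (illustrative).
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Predicted vs. Actual Values')
    # plt.show()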
def hyperparameter_tuning_example():
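    """Tune RandomForestClassifier hyperparameters with GridSearchCV and plot learning curves."""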
# Generate dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Grid search for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [10, 20, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
estimator=rf,
param_grid=param_grid,
cv=5,
n_jobs=-1,
scoring='accuracy'
)
grid_search.fit(X_train, y_train)
print("
Hyperparameter Tuning Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
# Learning curves
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
grid_search.best_estimator_,
X_train, y_train,
cv=5,
n_jobs=-1,
train_sizes=np.linspace(0.1, 1.0, 10)
)
# Plot learning curves
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
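    # Shading +/- one standard deviation across folds makes the variance of
    # each curve visible (illustrative; reuses the score arrays from above).
    plt.fill_between(train_sizes,
                     train_scores.mean(axis=1) - train_scores.std(axis=1),
                     train_scores.mean(axis=1) + train_scores.std(axis=1),
                     alpha=0.1)
    plt.fill_between(train_sizes,
                     test_scores.mean(axis=1) - test_scores.std(axis=1),
                     test_scores.mean(axis=1) + test_scores.std(axis=1),
                     alpha=0.1)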
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.title('Learning Curves')
plt.legend(loc='best')
# plt.show()
if __name__ == "__main__":
print("Running Random Forest Examples...")
print("
1. Classification Example:")
random_forest_classification_example()
print("
2. Regression Example:")
random_forest_regression_example()
print("
3. Hyperparameter Tuning Example:")
hyperparameter_tuning_example()