first commit
This commit is contained in:
@@ -0,0 +1,259 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import json
|
||||
import joblib
|
||||
import os
|
||||
import sys
|
||||
from sklearn.model_selection import train_test_split, GridSearchCV
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from imblearn.over_sampling import SMOTE
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
# Add the project root to the path so we can import from src
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from src import config
|
||||
from src.data_preprocessing import get_preprocessing_pipeline
|
||||
|
||||
|
||||
def train_model(X_train, y_train, X_val=None, y_val=None, use_smote=True):
    """
    Train a RandomForestClassifier pipeline and tune it with a grid search.

    The preprocessing step comes from ``get_preprocessing_pipeline()``. When
    ``use_smote`` is true, a ``SMOTE(random_state=42)`` step is inserted
    between the preprocessor and the classifier using imbalanced-learn's
    sampler-aware pipeline. Oversampling therefore happens on the training
    portion of every cross-validation fold only, and is skipped when the
    fitted pipeline predicts.

    Args:
        X_train: Training features. Must provide ``select_dtypes`` (i.e. a
            pandas DataFrame), which is used to report categorical columns.
        y_train: Training labels. The grid search is scored with ``'f1'``.
        X_val: Optional validation features.
        y_val: Optional validation labels. Validation metrics are printed
            only when both ``X_val`` and ``y_val`` are given.
        use_smote: Whether to oversample the minority class with SMOTE.

    Returns:
        tuple: ``(best_model, best_params)`` where ``best_model`` is the
        refitted best pipeline (steps ``'preprocessor'``, optionally
        ``'smote'``, and ``'classifier'``) and ``best_params`` is the
        winning entry of the parameter grid.
    """
    preprocessor = get_preprocessing_pipeline()

    print("Preprocessing data...")
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"Categorical columns: {categorical_cols}")

    steps = [('preprocessor', preprocessor)]
    if use_smote:
        # Imported here so the module's import block stays untouched;
        # imblearn is already a dependency of this file (SMOTE).
        from imblearn.pipeline import Pipeline as SamplerPipeline

        # SMOTE must be a pipeline step: resampling up front either leaks
        # synthetic rows into the CV validation folds or, as happened before
        # for data with categorical columns, is computed and then never used.
        # NOTE(review): SMOTE needs fully numeric input, so this assumes the
        # preprocessor encodes every categorical column - confirm against
        # get_preprocessing_pipeline.
        print("Applying SMOTE to handle class imbalance...")
        steps.append(('smote', SMOTE(random_state=42)))
        pipeline_cls = SamplerPipeline
    else:
        pipeline_cls = Pipeline
    steps.append(('classifier', RandomForestClassifier(random_state=42)))
    pipeline = pipeline_cls(steps=steps)

    # Hyperparameters explored by the grid search
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
    }

    print("Performing grid search with cross-validation...")
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

    # Report validation metrics only when a complete validation set is given
    if X_val is not None and y_val is not None:
        y_pred = best_model.predict(X_val)
        print("Validation metrics:")
        print_metrics(y_val, y_pred)

    return best_model, grid_search.best_params_
|
||||
|
||||
|
||||
def print_metrics(y_true, y_pred):
    """
    Compute, print and return classification metrics for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` scores plus
        ``confusion_matrix`` as a nested list.
    """
    # Score with each metric in turn, in the order they are reported
    accuracy, precision, recall, f1 = (
        scorer(y_true, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    # Convert the matrix to plain lists so the result can be dumped as JSON
    return dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
        confusion_matrix=cm.tolist(),
    )
|
||||
|
||||
|
||||
def _transformed_feature_names(preprocessor):
    """
    Build the post-preprocessing feature names of a fitted preprocessor.

    Assumes ``preprocessor.transformers_`` is laid out as
    (numerical, categorical, binary); any other layout raises and is handled
    by the caller.
    """
    fitted = preprocessor.transformers_

    # Numerical features keep their names
    names = list(fitted[0][2])

    # Categorical features are expanded by one-hot encoding
    categorical_transformer, categorical_features = fitted[1][1], fitted[1][2]
    if hasattr(categorical_transformer, 'get_feature_names_out'):
        # Newer scikit-learn versions
        cat_feature_names = categorical_transformer.get_feature_names_out(categorical_features)
    else:
        # Older scikit-learn versions
        cat_feature_names = categorical_transformer.named_steps['onehot'].get_feature_names(categorical_features)
    names.extend(cat_feature_names)

    # Binary features pass through unchanged
    names.extend(fitted[2][2])
    return names


def plot_feature_importance(model, feature_names):
    """
    Plot the model's feature importances and save the chart as a PNG.

    Args:
        model: Either an estimator exposing ``feature_importances_`` or a
            pipeline with ``'preprocessor'`` and ``'classifier'`` steps.
        feature_names: Names to use when the model is not a pipeline, or
            when names cannot be derived from the pipeline's preprocessor.

    Returns:
        pandas.DataFrame: Columns ``Feature`` and ``Importance``, sorted by
        descending importance. The chart is written to
        ``feature_importance.png`` inside ``config.MODELS_DIR``.
    """
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    else:
        importances = model.named_steps['classifier'].feature_importances_

    if hasattr(model, 'named_steps'):
        try:
            feature_names = _transformed_feature_names(model.named_steps['preprocessor'])
        except (AttributeError, IndexError, KeyError, TypeError) as exc:
            # A preprocessor with a different layout should not abort the
            # run; keep the caller's names and let the length check decide.
            print(f"Warning: Could not derive transformed feature names ({exc})")

    # Make sure the lengths match
    if len(feature_names) != len(importances):
        print(f"Warning: Feature names length ({len(feature_names)}) doesn't match importances length ({len(importances)})")
        # Use generic feature names if lengths don't match
        feature_names = [f'Feature {i}' for i in range(len(importances))]

    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    # The output directory may not exist yet: main() plots before save_model()
    # gets a chance to create it.
    os.makedirs(config.MODELS_DIR, exist_ok=True)

    plt.figure(figsize=(10, 6))
    try:
        sns.barplot(x='Importance', y='Feature', data=feature_importance)
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.savefig(os.path.join(config.MODELS_DIR, 'feature_importance.png'))
    finally:
        # Release the figure even if plotting or saving fails
        plt.close()

    return feature_importance
|
||||
|
||||
|
||||
def save_model(model, metadata):
    """
    Persist a trained model and its JSON metadata.

    Args:
        model: Fitted model object, serialized with joblib to
            ``config.MODEL_PATH``.
        metadata: JSON-serializable dict written to
            ``config.MODEL_METADATA_PATH``.
    """
    # Ensure the models directory is there before writing anything
    os.makedirs(config.MODELS_DIR, exist_ok=True)

    # Model first, then the metadata next to it
    joblib.dump(model, config.MODEL_PATH)
    with open(config.MODEL_METADATA_PATH, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)

    print(f"Model saved to {config.MODEL_PATH}")
    print(f"Model metadata saved to {config.MODEL_METADATA_PATH}")
|
||||
|
||||
|
||||
def main():
    """
    Run the training workflow end to end.

    Loads the processed training CSV, holds out a stratified 20% validation
    split, trains and evaluates the model, plots feature importances and
    saves the model together with its metadata. Returns early with a message
    when the processed data file is missing.
    """
    print("Loading processed training data...")
    try:
        dataset = pd.read_csv(config.PROCESSED_TRAIN_DATA_PATH)
    except FileNotFoundError:
        print("Processed training data not found. Please run data_preprocessing.py first.")
        return

    # Separate the target column from the feature columns
    target = dataset['is_fraud']
    features = dataset.drop(columns='is_fraud')

    # Stratified hold-out split for validation
    X_train, X_val, y_train, y_val = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )

    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_val.shape}")

    print("Training the model...")
    model, best_params = train_model(X_train, y_train, X_val, y_val)

    print("\nEvaluating on validation set:")
    metrics = print_metrics(y_val, model.predict(X_val))

    # Names of the input columns, before any preprocessing
    feature_names = features.columns.tolist()

    print("\nPlotting feature importance...")
    feature_importance = plot_feature_importance(model, feature_names)

    # Everything worth knowing about this run, stored alongside the model
    metadata = dict(
        model_type='RandomForestClassifier',
        best_parameters=best_params,
        metrics=metrics,
        feature_importance=feature_importance.to_dict(orient='records'),
        features=feature_names,
    )
    save_model(model, metadata)

    print("Model training completed!")
|
||||
|
||||
|
||||
# Run the training workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user