update for model_training script

This commit is contained in:
Ayomide
2025-07-29 10:27:21 +01:00
parent 6c1224eaca
commit 5c5ad60563
7 changed files with 246 additions and 152 deletions
+110 -66
View File
@@ -1,52 +1,105 @@
from flask import Flask, render_template, request from flask import Flask, render_template, request, jsonify
import joblib import joblib
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from datetime import datetime from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Tuple, Union
from dataclasses import dataclass
import logging
from config import MODEL_PATH, FEATURE_CONFIG
# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = Flask(__name__) app = Flask(__name__)
# Load the model @dataclass
try: class PredictionResult:
model = joblib.load('models/fraud_model.pkl') is_fraud: bool
print("Model loaded successfully") probability: float
except Exception as e: confidence: str # "high", "medium", "low"
print(f"Error loading model: {e}")
raise
def preprocess_input(data): class FraudDetectionService:
# Convert to DataFrame def __init__(self, model_path: Path):
df = pd.DataFrame([data]) self.model = self._load_model(model_path)
self.required_features = FEATURE_CONFIG['numeric_features'] + FEATURE_CONFIG['categorical_features']
# Ensure numeric fields are properly converted
numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'] def _load_model(self, model_path: Path) -> Any:
for field in numeric_fields: """Load trained model with error handling"""
df[field] = pd.to_numeric(df[field], errors='coerce') try:
model = joblib.load(model_path)
# Convert and extract datetime features if not hasattr(model, 'predict'):
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) raise ValueError("Invalid model object - missing predict method")
df['hour'] = df['trans_date_trans_time'].dt.hour logger.info("Model loaded successfully")
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek return model
df['month'] = df['trans_date_trans_time'].dt.month except Exception as e:
logger.error(f"Model loading failed: {str(e)}")
# Calculate age from dob raise
df['dob'] = pd.to_numeric(pd.to_datetime(df['dob']).astype(np.int64) / 10**9)
df['age'] = (pd.Timestamp.now().timestamp() - df['dob']) / (365 * 24 * 3600) def preprocess_input(self, input_data: Dict[str, Any]) -> pd.DataFrame:
"""Convert and validate input data"""
# Calculate distance safely df = pd.DataFrame([input_data])
df['distance'] = np.sqrt(
(df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + # Type conversion
(df['long'].astype(float) - df['merch_long'].astype(float))**2 for field in FEATURE_CONFIG['numeric_features']:
) if field in df.columns:
df[field] = pd.to_numeric(df[field], errors='coerce')
# Ensure all expected columns are present
expected_columns = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance', # Handle datetime features
'category', 'gender', 'job', 'merchant'] if 'trans_date_trans_time' in df.columns:
for col in expected_columns: df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
if col not in df.columns: df['hour'] = df['trans_date_trans_time'].dt.hour
df[col] = 0 # Default value if missing df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
df['month'] = df['trans_date_trans_time'].dt.month
return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
# Age calculation
if 'dob' in df.columns:
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = (pd.Timestamp.now() - df['dob']).dt.days / 365.25
# Geospatial features
if all(col in df.columns for col in ['lat', 'long', 'merch_lat', 'merch_long']):
df['distance'] = np.sqrt(
(df['lat'].astype(float) - df['merch_lat'].astype(float))**2 +
(df['long'].astype(float) - df['merch_long'].astype(float))**2
)
# Ensure all expected features exist
for feature in self.required_features:
if feature not in df.columns:
df[feature] = 0 # Safe default
logger.warning(f"Missing feature filled with default: {feature}")
return df.drop(FEATURE_CONFIG['drop_columns'], axis=1, errors='ignore')
def predict(self, input_data: Dict[str, Any]) -> PredictionResult:
"""Make fraud prediction with probabilities"""
try:
processed_data = self.preprocess_input(input_data)
probabilities = self.model.predict_proba(processed_data)
fraud_prob = probabilities[0][1] * 100
prediction = self.model.predict(processed_data)[0]
confidence = "high" if fraud_prob > 75 else "medium" if fraud_prob > 25 else "low"
return PredictionResult(
is_fraud=bool(prediction),
probability=round(fraud_prob, 2),
confidence=confidence
)
except Exception as e:
logger.error(f"Prediction failed: {str(e)}")
raise
# Initialize service
try:
fraud_service = FraudDetectionService(MODEL_PATH)
except Exception as e:
logger.critical(f"Service initialization failed: {str(e)}")
raise
@app.route('/') @app.route('/')
def home(): def home():
@@ -55,36 +108,27 @@ def home():
@app.route('/predict', methods=['POST']) @app.route('/predict', methods=['POST'])
def predict(): def predict():
try: try:
# Get data from form input_data = request.form.to_dict()
data = request.form.to_dict() logger.info(f"Received prediction request: {input_data}")
print("Received data:", data) # Debugging
# Preprocess the input result = fraud_service.predict(input_data)
processed_data = preprocess_input(data)
print("Processed data:", processed_data) # Debugging
# Get prediction probabilities response = {
probabilities = model.predict_proba(processed_data) "is_fraud": result.is_fraud,
print("Raw probabilities:", probabilities) # Debugging "probability": result.probability,
"confidence": result.confidence,
# Extract fraud probability (class 1) "status": "success"
fraud_probability = probabilities[0][1] * 100 # Convert to percentage
# Make prediction
prediction = model.predict(processed_data)[0]
result = {
'prediction': bool(prediction),
'probability': float(fraud_probability),
'is_fraud': bool(prediction)
} }
print("Prediction result:", result) # Debugging return render_template('index.html', prediction=response)
return render_template('index.html', prediction=result)
except Exception as e: except Exception as e:
print("Prediction error:", str(e)) # Debugging logger.error(f"Prediction error: {str(e)}")
return f"Error: {str(e)}", 400 return jsonify({
"status": "error",
"message": "Failed to process prediction",
"error": str(e)
}), 400
if __name__ == '__main__': if __name__ == '__main__':
app.run(debug=True) app.run(host='0.0.0.0', port=5000, debug=True)
View File
+118 -60
View File
@@ -1,64 +1,122 @@
import os import os
from pathlib import Path from pathlib import Path
from typing import Dict, Any, List
from sklearn.ensemble import RandomForestClassifier
# Directory Paths class Config:
BASE_DIR = Path(__file__).parent.parent """Centralized configuration for the fraud detection system"""
DATA_DIR = BASE_DIR / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw' # ========== Directory Structure ==========
PROCESSED_DATA_DIR = DATA_DIR / 'processed' BASE_DIR = Path(__file__).parent.parent
MODELS_DIR = BASE_DIR / 'models' DATA_DIR = BASE_DIR / 'data'
REPORTS_DIR = BASE_DIR / 'reports' RAW_DATA_DIR = DATA_DIR / 'raw'
FIGURES_DIR = REPORTS_DIR / 'figures' PROCESSED_DATA_DIR = DATA_DIR / 'processed'
MODELS_DIR = BASE_DIR / 'models'
REPORTS_DIR = BASE_DIR / 'reports'
FIGURES_DIR = REPORTS_DIR / 'figures'
LOGS_DIR = BASE_DIR / 'logs'
# ========== File Paths ==========
TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
ERROR_LOG_PATH = LOGS_DIR / 'errors.log'
# ========== Model Configuration ==========
MODEL_PARAMS: Dict[str, Any] = {
'classifier': RandomForestClassifier,
'classifier_params': {
'n_estimators': 100,
'max_depth': None,
'min_samples_split': 2,
'class_weight': 'balanced',
'random_state': 42,
'n_jobs': -1
}
}
# ========== Feature Engineering ==========
NUMERIC_FEATURES: List[str] = [
'amt', 'city_pop', 'hour',
'day_of_week', 'month', 'age',
'distance'
]
CATEGORICAL_FEATURES: List[str] = [
'category', 'gender', 'job', 'merchant'
]
FEATURE_CONFIG: Dict[str, Any] = {
'numeric_features': NUMERIC_FEATURES,
'categorical_features': CATEGORICAL_FEATURES,
'time_features': ['hour', 'day_of_week', 'month'],
'demographic_features': ['age'],
'geographic_features': ['distance'],
'behavioral_features': ['txn_count_24h', 'time_since_last_txn'],
'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'],
'required_features': NUMERIC_FEATURES + CATEGORICAL_FEATURES
}
# ========== Prediction Configuration ==========
PREDICTION_THRESHOLDS: Dict[str, float] = {
'high_risk': 0.75,
'medium_risk': 0.25,
'low_risk': 0.01
}
# ========== Flask Configuration ==========
class FlaskConfig:
    """Flask settings, each overridable through an environment variable.

    Every default equals the value used when the variable is unset, so an
    unconfigured environment keeps its current behaviour.
    """

    def _env_flag(name, default):
        """Return environment variable *name* interpreted as a boolean.

        Args:
            name: Name of the environment variable to read.
            default: Value returned when the variable is not set at all.

        Returns:
            True for ``1``, ``true``, ``yes`` or ``on`` in any letter case
            (surrounding whitespace ignored), False for any other value,
            and *default* when the variable is unset.
        """
        raw = os.getenv(name)
        if raw is None:
            return default
        return raw.strip().lower() in ('1', 'true', 'yes', 'on')

    # A plain `== 'True'` comparison treated 'true', '1' and 'yes' as False;
    # the flag is now parsed case-insensitively.
    # NOTE(review): debug still defaults to on - confirm that production
    # deployments always export FLASK_DEBUG=False.
    DEBUG = _env_flag('FLASK_DEBUG', True)
    # NOTE(review): the fallback is a placeholder, not a real secret; set
    # FLASK_SECRET_KEY wherever sessions or signing matter.
    SECRET_KEY = os.getenv('FLASK_SECRET_KEY', 'your-secret-key-here')
    # The default is passed as a string so both branches reach int() with
    # the same type that os.getenv returns for a set variable.
    MAX_CONTENT_LENGTH = int(os.getenv('MAX_UPLOAD_SIZE', str(16 * 1024 * 1024)))  # 16MB default
    JSONIFY_PRETTYPRINT_REGULAR = True
    SERVER_NAME = os.getenv('FLASK_SERVER_NAME')

    # Defined as a plain function above so the class body can call it on
    # every supported Python version; exposed afterwards as a staticmethod.
    _env_flag = staticmethod(_env_flag)
# ========== Logging Configuration ==========
LOGGING_CONFIG: Dict[str, Any] = {
'version': 1,
'formatters': {
'default': {
'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
}
},
'handlers': {
'file': {
'class': 'logging.FileHandler',
'filename': ERROR_LOG_PATH,
'formatter': 'default'
},
'console': {
'class': 'logging.StreamHandler',
'formatter': 'default'
}
},
'root': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'handlers': ['file', 'console']
}
}
# ========== Constants ==========
DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S'
RANDOM_STATE: int = 42
TEST_SIZE: float = 0.2
@classmethod
def init_directories(cls):
    """Create each working directory the configuration refers to.

    Missing parent directories are created as well, and directories that
    already exist are left untouched.
    """
    attribute_names = (
        'RAW_DATA_DIR',
        'PROCESSED_DATA_DIR',
        'MODELS_DIR',
        'FIGURES_DIR',
        'LOGS_DIR',
    )
    # Resolve every path before touching the filesystem, so a missing
    # attribute fails before any directory has been created.
    targets = [getattr(cls, attribute) for attribute in attribute_names]
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
# File Paths # Initialize directories on import
TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv' Config.init_directories()
TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
# Model Configuration
MODEL_PARAMS = {
'classifier': RandomForestClassifier,
'classifier_params': {
'n_estimators': 100,
'max_depth': None,
'min_samples_split': 2,
'class_weight': 'balanced',
'random_state': 42,
'n_jobs': -1
},
'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'],
'categorical_features': ['category', 'gender', 'job', 'merchant']
}
# Feature Engineering Configuration
FEATURE_CONFIG = {
'time_features': ['hour', 'day_of_week', 'month'],
'demographic_features': ['age'],
'geographic_features': ['distance'],
'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time']
}
# Evaluation Metrics
EVAL_METRICS = {
'threshold': 0.5,
'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'],
'target_names': ['Legitimate', 'Fraud']
}
# Flask App Configuration
class FlaskConfig:
DEBUG = True
SECRET_KEY = 'your-secret-key-here'
MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16MB upload limit
JSONIFY_PRETTYPRINT_REGULAR = True
# Create directories if they don't exist
for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]:
directory.mkdir(parents=True, exist_ok=True)
# Constants
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
RANDOM_STATE = 42
TEST_SIZE = 0.2
+5 -2
View File
@@ -1,14 +1,17 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from datetime import datetime from datetime import datetime
from config import TRAIN_DATA_PATH, TEST_DATA_PATH
# Load data with proper dtype specification # Load data with proper dtype specification
def load_data(filepath): def load_data(filepath):
return pd.read_csv(filepath, low_memory=False) return pd.read_csv(filepath, low_memory=False)
train_df = load_data('data/raw/fraudTrain.csv')
test_df = load_data('data/raw/fraudTest.csv') train_df = load_data(TRAIN_DATA_PATH)
test_df = load_data(TEST_DATA_PATH)
# Data cleaning function # Data cleaning function
def clean_data(df): def clean_data(df):
+11
View File
@@ -0,0 +1,11 @@
# exceptions.py
class FraudDetectionError(Exception):
    """Base class for the project's own exceptions.

    Catching this type also catches DataValidationError and ModelServeError.
    """


class DataValidationError(FraudDetectionError):
    """Raised when data fails validation."""


class ModelServeError(FraudDetectionError):
    """Raised when an API prediction fails."""
+2 -24
View File
@@ -7,6 +7,7 @@ from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
import joblib import joblib
from datetime import datetime from datetime import datetime
from config import TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH
def load_data(train_file, test_file): def load_data(train_file, test_file):
@@ -20,25 +21,6 @@ def load_data(train_file, test_file):
return train_df, test_df return train_df, test_df
def feature_engineering(df, reference_date=None):
    """Derive time, age and distance features from raw transaction columns.

    Args:
        df: DataFrame containing the ``trans_date_trans_time``, ``dob``,
            ``lat``, ``long``, ``merch_lat`` and ``merch_long`` columns.
        reference_date: Point in time against which age is measured.
            When omitted the current date/time is used, which is the
            previous behaviour; pass a fixed value to get reproducible
            features.

    Returns:
        A new DataFrame in which the two date columns are parsed to
        datetimes and the ``hour``, ``day_of_week``, ``month``, ``age`` and
        ``distance`` columns are added. The input frame is left unmodified.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    required = (
        'trans_date_trans_time', 'dob',
        'lat', 'long', 'merch_lat', 'merch_long',
    )
    # Fail before doing any work rather than midway through the derivation.
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"feature_engineering: missing required columns {missing}")

    # Work on a copy so the caller's frame is never altered as a side effect.
    out = df.copy()

    # Convert transaction time to datetime and extract time features
    out['trans_date_trans_time'] = pd.to_datetime(out['trans_date_trans_time'])
    out['hour'] = out['trans_date_trans_time'].dt.hour
    out['day_of_week'] = out['trans_date_trans_time'].dt.dayofweek
    out['month'] = out['trans_date_trans_time'].dt.month

    # Calculate age from dob. Whole years are approximated with 365-day
    # years (leap days ignored); kept as-is so feature values do not shift.
    if reference_date is None:
        reference = pd.to_datetime('today')
    else:
        reference = pd.to_datetime(reference_date)
    out['dob'] = pd.to_datetime(out['dob'])
    out['age'] = (reference - out['dob']).dt.days // 365

    # Straight-line distance between user and merchant, computed directly
    # on the coordinate values (so in coordinate units, not kilometres).
    out['distance'] = np.sqrt(
        (out['lat'] - out['merch_lat']) ** 2
        + (out['long'] - out['merch_long']) ** 2
    )

    return out
def train_model(train_df, test_df): def train_model(train_df, test_df):
# Define features and target # Define features and target
X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
@@ -79,11 +61,7 @@ def train_model(train_df, test_df):
def main(): def main():
# Load data # Load data
train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv') train_df, test_df = load_data(TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH)
# Feature engineering
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)
# Print dataset sizes after cleaning # Print dataset sizes after cleaning
print(f"Training samples after cleaning: {len(train_df)}") print(f"Training samples after cleaning: {len(train_df)}")
View File