update for model_training script
This commit is contained in:
+110
-66
@@ -1,52 +1,105 @@
|
|||||||
from flask import Flask, render_template, request
|
from flask import Flask, render_template, request, jsonify
|
||||||
import joblib
|
import joblib
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Tuple, Union
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import logging
|
||||||
|
from config import MODEL_PATH, FEATURE_CONFIG
|
||||||
|
|
||||||
|
# Initialize logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@dataclass
class PredictionResult:
    """Outcome of scoring a single transaction.

    Attributes:
        is_fraud: Class label returned by the model, as a boolean.
        probability: Probability of the fraud class, expressed as a percentage.
        confidence: One of "high", "medium" or "low".
    """

    is_fraud: bool
    probability: float
    confidence: str
|
|
||||||
|
|
||||||
class FraudDetectionService:
    """Load a persisted fraud model and score single transactions with it.

    The service owns three steps: loading and validating the model object,
    turning one raw input record into the feature frame the model expects,
    and converting the model output into a ``PredictionResult``.
    """

    # Fraud-probability cut-offs (in percent) used to label a prediction.
    # The label is derived from the fraud probability alone, so a transaction
    # scored at 1% is reported as "low" even though the model is sure of it.
    HIGH_CONFIDENCE_PCT = 75
    MEDIUM_CONFIDENCE_PCT = 25

    def __init__(self, model_path: Path):
        """Load the model stored at *model_path* and cache the feature list."""
        self.model = self._load_model(model_path)
        self.required_features = (
            FEATURE_CONFIG['numeric_features'] + FEATURE_CONFIG['categorical_features']
        )

    def _load_model(self, model_path: Path) -> Any:
        """Load the trained model and verify it offers the methods we call.

        Raises:
            ValueError: If the loaded object lacks ``predict`` or
                ``predict_proba``.
            Exception: Any error raised by ``joblib.load`` is logged and
                re-raised unchanged.
        """
        try:
            # NOTE: joblib.load unpickles the file, which can execute arbitrary
            # code. Only point model_path at artefacts produced by this project.
            model = joblib.load(model_path)
            # predict() below needs both methods; checking them here turns a
            # per-request failure into a start-up failure.
            for method_name in ('predict', 'predict_proba'):
                if not hasattr(model, method_name):
                    raise ValueError(
                        f"Invalid model object - missing {method_name} method"
                    )
            logger.info("Model loaded successfully")
            return model
        except Exception as e:
            logger.exception("Model loading failed: %s", e)
            raise

    def preprocess_input(self, input_data: Dict[str, Any]) -> pd.DataFrame:
        """Convert one raw input record into a single-row feature frame.

        Numeric features are coerced with ``errors='coerce'`` (unparseable
        values become NaN), time/age/distance features are derived when their
        source columns are present, and any required feature still missing is
        filled with 0. Columns listed in ``FEATURE_CONFIG['drop_columns']``
        are removed from the result.
        """
        df = pd.DataFrame([input_data])

        # Type conversion
        for field in FEATURE_CONFIG['numeric_features']:
            if field in df.columns:
                df[field] = pd.to_numeric(df[field], errors='coerce')

        # Handle datetime features
        if 'trans_date_trans_time' in df.columns:
            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
            transaction_time = df['trans_date_trans_time'].dt
            df['hour'] = transaction_time.hour
            df['day_of_week'] = transaction_time.dayofweek
            df['month'] = transaction_time.month

        # Age calculation
        # NOTE(review): age is measured against the current time; confirm the
        # training pipeline derives it the same way.
        if 'dob' in df.columns:
            df['dob'] = pd.to_datetime(df['dob'])
            df['age'] = (pd.Timestamp.now() - df['dob']).dt.days / 365.25

        # Geospatial features: straight-line distance in coordinate degrees
        coordinate_columns = ['lat', 'long', 'merch_lat', 'merch_long']
        if all(col in df.columns for col in coordinate_columns):
            df['distance'] = np.sqrt(
                (df['lat'].astype(float) - df['merch_lat'].astype(float)) ** 2
                + (df['long'].astype(float) - df['merch_long'].astype(float)) ** 2
            )

        # Ensure all expected features exist
        for feature in self.required_features:
            if feature not in df.columns:
                df[feature] = 0  # Safe default
                logger.warning("Missing feature filled with default: %s", feature)

        return df.drop(FEATURE_CONFIG['drop_columns'], axis=1, errors='ignore')

    def predict(self, input_data: Dict[str, Any]) -> PredictionResult:
        """Score one input record and return label, probability and confidence.

        Raises:
            Exception: Any preprocessing or model error is logged with its
                traceback and re-raised unchanged.
        """
        try:
            processed_data = self.preprocess_input(input_data)
            probabilities = self.model.predict_proba(processed_data)
            # Column 1 is taken as the fraud class. float() turns the numpy
            # scalar into a builtin float so the result matches its annotation.
            fraud_prob = float(probabilities[0][1]) * 100
            prediction = self.model.predict(processed_data)[0]
        except Exception as e:
            logger.exception("Prediction failed: %s", e)
            raise

        if fraud_prob > self.HIGH_CONFIDENCE_PCT:
            confidence = "high"
        elif fraud_prob > self.MEDIUM_CONFIDENCE_PCT:
            confidence = "medium"
        else:
            confidence = "low"

        return PredictionResult(
            is_fraud=bool(prediction),
            probability=round(fraud_prob, 2),
            confidence=confidence,
        )
|
||||||
|
|
||||||
|
# Build the shared service instance once at import time. A failure here is
# logged at CRITICAL level and re-raised, so the module does not finish
# importing without a working service.
try:
    fraud_service = FraudDetectionService(MODEL_PATH)
except Exception as init_error:
    logger.critical("Service initialization failed: " + str(init_error))
    raise
|
||||||
|
|
||||||
@app.route('/')
|
@app.route('/')
|
||||||
def home():
|
def home():
|
||||||
@@ -55,36 +108,27 @@ def home():
|
|||||||
@app.route('/predict', methods=['POST'])
def predict():
    """Score the submitted transaction form and render the outcome.

    Returns:
        The rendered ``index.html`` template with a ``prediction`` dict
        (``is_fraud``, ``probability``, ``confidence``, ``status``) on
        success, or a JSON error payload with HTTP status 400 on any failure.
    """
    try:
        input_data = request.form.to_dict()
        # Log the field names only: the form may carry personal data such as
        # a date of birth, which should not end up in the application log.
        logger.info(
            "Received prediction request with fields: %s", sorted(input_data)
        )

        result = fraud_service.predict(input_data)

        response = {
            "is_fraud": result.is_fraud,
            "probability": result.probability,
            "confidence": result.confidence,
            "status": "success",
        }

        return render_template('index.html', prediction=response)

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error discarded.
        logger.exception("Prediction error: %s", e)
        return jsonify({
            "status": "error",
            "message": "Failed to process prediction",
            "error": str(e),
        }), 400
|
||||||
|
|
||||||
if __name__ == '__main__':
    import os

    # The Werkzeug debugger lets anyone who can reach the server run arbitrary
    # code, so it must not be on by default while listening on every
    # interface. Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in {
        '1', 'true', 'yes', 'on',
    }
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
|
||||||
+118
-60
@@ -1,64 +1,122 @@
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
|
||||||
def _env_flag(name: str, default: bool) -> bool:
    """Read environment variable *name* as a boolean switch.

    Unset variables yield *default*; otherwise 1/true/yes/on (any letter
    case) mean True and every other value means False.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() in {'1', 'true', 'yes', 'on'}


# ========== Flask Configuration ==========
class FlaskConfig:
    """Flask settings, each overridable through an environment variable."""

    # Debug stays on when FLASK_DEBUG is unset, as before. Flask's own
    # convention of FLASK_DEBUG=1 is now honoured instead of being read as off.
    DEBUG = _env_flag('FLASK_DEBUG', True)
    # NOTE(review): the fallback is a placeholder, not a secret. Set
    # FLASK_SECRET_KEY in every real deployment.
    SECRET_KEY = os.getenv('FLASK_SECRET_KEY', 'your-secret-key-here')
    MAX_CONTENT_LENGTH = int(os.getenv('MAX_UPLOAD_SIZE', 16 * 1024 * 1024))  # 16MB default
    JSONIFY_PRETTYPRINT_REGULAR = True
    SERVER_NAME = os.getenv('FLASK_SERVER_NAME', None)


class Config:
    """Centralized configuration for the fraud detection system"""

    # ========== Directory Structure ==========
    BASE_DIR = Path(__file__).parent.parent
    DATA_DIR = BASE_DIR / 'data'
    RAW_DATA_DIR = DATA_DIR / 'raw'
    PROCESSED_DATA_DIR = DATA_DIR / 'processed'
    MODELS_DIR = BASE_DIR / 'models'
    REPORTS_DIR = BASE_DIR / 'reports'
    FIGURES_DIR = REPORTS_DIR / 'figures'
    LOGS_DIR = BASE_DIR / 'logs'

    # ========== File Paths ==========
    TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
    TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
    TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
    TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
    MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
    ERROR_LOG_PATH = LOGS_DIR / 'errors.log'

    # ========== Model Configuration ==========
    MODEL_PARAMS: Dict[str, Any] = {
        'classifier': RandomForestClassifier,
        'classifier_params': {
            'n_estimators': 100,
            'max_depth': None,
            'min_samples_split': 2,
            'class_weight': 'balanced',
            'random_state': 42,
            'n_jobs': -1,
        },
    }

    # ========== Feature Engineering ==========
    NUMERIC_FEATURES: List[str] = [
        'amt', 'city_pop', 'hour',
        'day_of_week', 'month', 'age',
        'distance',
    ]

    CATEGORICAL_FEATURES: List[str] = [
        'category', 'gender', 'job', 'merchant',
    ]

    FEATURE_CONFIG: Dict[str, Any] = {
        'numeric_features': NUMERIC_FEATURES,
        'categorical_features': CATEGORICAL_FEATURES,
        'time_features': ['hour', 'day_of_week', 'month'],
        'demographic_features': ['age'],
        'geographic_features': ['distance'],
        'behavioral_features': ['txn_count_24h', 'time_since_last_txn'],
        'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'],
        'required_features': NUMERIC_FEATURES + CATEGORICAL_FEATURES,
    }

    # ========== Evaluation Metrics ==========
    # Kept from the previous module-level configuration so that code still
    # importing EVAL_METRICS continues to work.
    EVAL_METRICS: Dict[str, Any] = {
        'threshold': 0.5,
        'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'],
        'target_names': ['Legitimate', 'Fraud'],
    }

    # ========== Prediction Configuration ==========
    PREDICTION_THRESHOLDS: Dict[str, float] = {
        'high_risk': 0.75,
        'medium_risk': 0.25,
        'low_risk': 0.01,
    }

    # ========== Flask Configuration ==========
    # Also reachable as Config.FlaskConfig, for callers that expect it here.
    FlaskConfig = FlaskConfig

    # ========== Logging Configuration ==========
    LOGGING_CONFIG: Dict[str, Any] = {
        'version': 1,
        'formatters': {
            'default': {
                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            },
        },
        'handlers': {
            'file': {
                'class': 'logging.FileHandler',
                'filename': ERROR_LOG_PATH,
                'formatter': 'default',
            },
            'console': {
                'class': 'logging.StreamHandler',
                'formatter': 'default',
            },
        },
        'root': {
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'handlers': ['file', 'console'],
        },
    }

    # ========== Constants ==========
    DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S'
    RANDOM_STATE: int = 42
    TEST_SIZE: float = 0.2

    @classmethod
    def init_directories(cls):
        """Ensure all required directories exist"""
        required_dirs = [
            cls.RAW_DATA_DIR,
            cls.PROCESSED_DATA_DIR,
            cls.MODELS_DIR,
            cls.FIGURES_DIR,
            cls.LOGS_DIR,
        ]

        for directory in required_dirs:
            directory.mkdir(parents=True, exist_ok=True)


# Module-level aliases. Other modules import these names straight from
# `config` (for example `from config import MODEL_PATH, FEATURE_CONFIG`), and
# that import fails if the values exist only as attributes of Config.
BASE_DIR = Config.BASE_DIR
DATA_DIR = Config.DATA_DIR
RAW_DATA_DIR = Config.RAW_DATA_DIR
PROCESSED_DATA_DIR = Config.PROCESSED_DATA_DIR
MODELS_DIR = Config.MODELS_DIR
REPORTS_DIR = Config.REPORTS_DIR
FIGURES_DIR = Config.FIGURES_DIR
LOGS_DIR = Config.LOGS_DIR
TRAIN_DATA_PATH = Config.TRAIN_DATA_PATH
TEST_DATA_PATH = Config.TEST_DATA_PATH
TRAIN_PROCESSED_PATH = Config.TRAIN_PROCESSED_PATH
TEST_PROCESSED_PATH = Config.TEST_PROCESSED_PATH
MODEL_PATH = Config.MODEL_PATH
ERROR_LOG_PATH = Config.ERROR_LOG_PATH
MODEL_PARAMS = Config.MODEL_PARAMS
FEATURE_CONFIG = Config.FEATURE_CONFIG
EVAL_METRICS = Config.EVAL_METRICS
PREDICTION_THRESHOLDS = Config.PREDICTION_THRESHOLDS
LOGGING_CONFIG = Config.LOGGING_CONFIG
DATE_FORMAT = Config.DATE_FORMAT
RANDOM_STATE = Config.RANDOM_STATE
TEST_SIZE = Config.TEST_SIZE

# Initialize directories on import
Config.init_directories()
|
|
||||||
@@ -1,14 +1,17 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from config import TRAIN_DATA_PATH, TEST_DATA_PATH
|
||||||
|
|
||||||
|
|
||||||
# Load data, optionally with an explicit dtype specification
def load_data(filepath, dtype=None):
    """Read a CSV file into a DataFrame.

    Args:
        filepath: Location of the CSV file (anything ``pandas.read_csv``
            accepts).
        dtype: Optional dtype, or mapping of column name to dtype, forwarded
            to ``pandas.read_csv``. ``None`` lets pandas infer the types.

    Returns:
        The parsed DataFrame. ``low_memory=False`` makes pandas read the file
        in one pass, so each column gets a single consistent inferred type.
    """
    return pd.read_csv(filepath, dtype=dtype, low_memory=False)
|
||||||
|
|
||||||
# Read the train and test CSVs (in that order) from the paths imported from
# config.
train_df, test_df = (
    load_data(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)
|
||||||
|
|
||||||
|
|
||||||
# Data cleaning function
|
# Data cleaning function
|
||||||
def clean_data(df):
|
def clean_data(df):
|
||||||
|
|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
# exceptions.py
|
||||||
|
class FraudDetectionError(Exception):
    """Common base class for the exceptions defined in this module."""
|
||||||
|
|
||||||
|
|
||||||
|
class DataValidationError(FraudDetectionError):
    """Signal that data did not pass validation."""
|
||||||
|
|
||||||
|
|
||||||
|
class ModelServeError(FraudDetectionError):
    """Signal a failure while serving a prediction through the API."""
|
||||||
+2
-24
@@ -7,6 +7,7 @@ from sklearn.compose import ColumnTransformer
|
|||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
import joblib
|
import joblib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from config import TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH
|
||||||
|
|
||||||
|
|
||||||
def load_data(train_file, test_file):
|
def load_data(train_file, test_file):
|
||||||
@@ -20,25 +21,6 @@ def load_data(train_file, test_file):
|
|||||||
return train_df, test_df
|
return train_df, test_df
|
||||||
|
|
||||||
|
|
||||||
def feature_engineering(df):
    """Return a copy of *df* with time, age and distance features added.

    Args:
        df: Frame with ``trans_date_trans_time``, ``dob``, ``lat``, ``long``,
            ``merch_lat`` and ``merch_long`` columns.

    Returns:
        A new DataFrame in which ``trans_date_trans_time`` and ``dob`` are
        datetimes and ``hour``, ``day_of_week``, ``month``, ``age`` and
        ``distance`` have been added. The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    # Convert transaction time to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Extract time features
    transaction_time = df['trans_date_trans_time'].dt
    df['hour'] = transaction_time.hour
    df['day_of_week'] = transaction_time.dayofweek
    df['month'] = transaction_time.month

    # Calculate age from dob: whole 365-day years elapsed up to the current time
    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = (pd.Timestamp.now() - df['dob']).dt.days // 365

    # Calculate distance between user and merchant (straight line, in
    # coordinate degrees)
    df['distance'] = np.sqrt(
        (df['lat'] - df['merch_lat']) ** 2 + (df['long'] - df['merch_long']) ** 2
    )

    return df
|
|
||||||
|
|
||||||
|
|
||||||
def train_model(train_df, test_df):
|
def train_model(train_df, test_df):
|
||||||
# Define features and target
|
# Define features and target
|
||||||
X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
|
X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
|
||||||
@@ -79,11 +61,7 @@ def train_model(train_df, test_df):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Load data
|
# Load data
|
||||||
train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')
|
train_df, test_df = load_data(TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH)
|
||||||
|
|
||||||
# Feature engineering
|
|
||||||
train_df = feature_engineering(train_df)
|
|
||||||
test_df = feature_engineering(test_df)
|
|
||||||
|
|
||||||
# Print dataset sizes after cleaning
|
# Print dataset sizes after cleaning
|
||||||
print(f"Training samples after cleaning: {len(train_df)}")
|
print(f"Training samples after cleaning: {len(train_df)}")
|
||||||
|
|||||||
Reference in New Issue
Block a user