update for model_training script

2025-07-29 10:27:21 +01:00
parent 6c1224eaca
commit 5c5ad60563
7 changed files with 246 additions and 152 deletions
@@ -1,52 +1,105 @@
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request, jsonify
 import joblib
 import pandas as pd
 import numpy as np
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, Any, Tuple, Union
 from dataclasses import dataclass
 import logging
 from config import MODEL_PATH, FEATURE_CONFIG
 # Initialize logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = Flask(__name__)
-# Load the model
+@dataclass
-try:
+class PredictionResult:
-    model = joblib.load('models/fraud_model.pkl')
+    is_fraud: bool
-    print("Model loaded successfully")
+    probability: float
-except Exception as e:
+    confidence: str  # "high", "medium", "low"
    print(f"Error loading model: {e}")
    raise
-def preprocess_input(data):
+class FraudDetectionService:
-    # Convert to DataFrame
+    def __init__(self, model_path: Path):
-    df = pd.DataFrame([data])
+        self.model = self._load_model(model_path)
-    
+        self.required_features = FEATURE_CONFIG['numeric_features'] + FEATURE_CONFIG['categorical_features']
-    # Ensure numeric fields are properly converted
+        
-    numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']
+    def _load_model(self, model_path: Path) -> Any:
-    for field in numeric_fields:
+        """Load trained model with error handling"""
-        df[field] = pd.to_numeric(df[field], errors='coerce')
+        try:
-    
+            model = joblib.load(model_path)
-    # Convert and extract datetime features
+            if not hasattr(model, 'predict'):
-    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
+                raise ValueError("Invalid model object - missing predict method")
-    df['hour'] = df['trans_date_trans_time'].dt.hour
+            logger.info("Model loaded successfully")
-    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
+            return model
-    df['month'] = df['trans_date_trans_time'].dt.month
+        except Exception as e:
-    
+            logger.error(f"Model loading failed: {str(e)}")
-    # Calculate age from dob
+            raise
-    df['dob'] = pd.to_numeric(pd.to_datetime(df['dob']).astype(np.int64) / 10**9)
+
-    df['age'] = (pd.Timestamp.now().timestamp() - df['dob']) / (365 * 24 * 3600)
+    def preprocess_input(self, input_data: Dict[str, Any]) -> pd.DataFrame:
-    
+        """Convert and validate input data"""
-    # Calculate distance safely
+        df = pd.DataFrame([input_data])
-    df['distance'] = np.sqrt(
+        
-        (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + 
+        # Type conversion
-        (df['long'].astype(float) - df['merch_long'].astype(float))**2
+        for field in FEATURE_CONFIG['numeric_features']:
-    )
+            if field in df.columns:
-    
+                df[field] = pd.to_numeric(df[field], errors='coerce')
-    # Ensure all expected columns are present
+        
-    expected_columns = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance',
+        # Handle datetime features
-                       'category', 'gender', 'job', 'merchant']
+        if 'trans_date_trans_time' in df.columns:
-    for col in expected_columns:
+            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
-        if col not in df.columns:
+            df['hour'] = df['trans_date_trans_time'].dt.hour
-            df[col] = 0  # Default value if missing
+            df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
-    
+            df['month'] = df['trans_date_trans_time'].dt.month
-    return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
+        
        # Age calculation
        if 'dob' in df.columns:
            df['dob'] = pd.to_datetime(df['dob'])
            df['age'] = (pd.Timestamp.now() - df['dob']).dt.days / 365.25
        # Geospatial features
        if all(col in df.columns for col in ['lat', 'long', 'merch_lat', 'merch_long']):
            df['distance'] = np.sqrt(
                (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + 
                (df['long'].astype(float) - df['merch_long'].astype(float))**2
            )
        # Ensure all expected features exist
        for feature in self.required_features:
            if feature not in df.columns:
                df[feature] = 0  # Safe default
                logger.warning(f"Missing feature filled with default: {feature}")
        return df.drop(FEATURE_CONFIG['drop_columns'], axis=1, errors='ignore')
    def predict(self, input_data: Dict[str, Any]) -> PredictionResult:
        """Score a single transaction and return the fraud verdict.

        Args:
            input_data: Raw field values for one transaction; handed
                unchanged to ``preprocess_input``.

        Returns:
            A ``PredictionResult`` whose ``is_fraud`` comes from
            ``model.predict``, whose ``probability`` is the class-1
            probability expressed in percent (rounded to two decimals), and
            whose ``confidence`` is bucketed from that percentage
            (> 75 "high", > 25 "medium", otherwise "low").

        Raises:
            Exception: Any failure during preprocessing or inference is
                logged with its traceback and re-raised unchanged.
        """
        try:
            processed_data = self.preprocess_input(input_data)
            probabilities = self.model.predict_proba(processed_data)
            # Column 1 of the first row is used as the fraud probability.
            # Cast to a built-in float so the dataclass field holds the
            # annotated type even if the model returns another scalar type.
            fraud_prob = float(probabilities[0][1]) * 100
            prediction = self.model.predict(processed_data)[0]

            # NOTE(review): the label depends only on the fraud probability,
            # so a transaction scored near 0% is labelled "low" — confirm
            # this is meant as a risk level rather than model certainty.
            if fraud_prob > 75:
                confidence = "high"
            elif fraud_prob > 25:
                confidence = "medium"
            else:
                confidence = "low"

            return PredictionResult(
                is_fraud=bool(prediction),
                probability=round(fraud_prob, 2),
                confidence=confidence,
            )
        except Exception as e:
            # logger.exception logs at ERROR level and appends the traceback,
            # which a plain logger.error call with str(e) would lose.
            logger.exception(f"Prediction failed: {str(e)}")
            raise
 # Build the shared FraudDetectionService once, at import time, from MODEL_PATH.
 # If construction fails, the error is logged at CRITICAL level and re-raised,
 # so the module does not finish importing without a usable service.
 try:
    fraud_service = FraudDetectionService(MODEL_PATH)
 except Exception as e:
    logger.critical(f"Service initialization failed: {str(e)}")
    raise
@app.route('/')
 def home():
@@ -55,36 +108,27 @@ def home():
@app.route('/predict', methods=['POST'])
 def predict():
    try:
-        # Get data from form
+        input_data = request.form.to_dict()
-        data = request.form.to_dict()
+        logger.info(f"Received prediction request: {input_data}")
        print("Received data:", data)  # Debugging
-        # Preprocess the input
+        result = fraud_service.predict(input_data)
        processed_data = preprocess_input(data)
        print("Processed data:", processed_data)  # Debugging
-        # Get prediction probabilities
+        response = {
-        probabilities = model.predict_proba(processed_data)
+            "is_fraud": result.is_fraud,
-        print("Raw probabilities:", probabilities)  # Debugging
+            "probability": result.probability,
-        
+            "confidence": result.confidence,
-        # Extract fraud probability (class 1)
+            "status": "success"
        fraud_probability = probabilities[0][1] * 100  # Convert to percentage
        # Make prediction
        prediction = model.predict(processed_data)[0]
        result = {
            'prediction': bool(prediction),
            'probability': float(fraud_probability),
            'is_fraud': bool(prediction)
        }
-        print("Prediction result:", result)  # Debugging
+        return render_template('index.html', prediction=response)
        return render_template('index.html', prediction=result)
    except Exception as e:
-        print("Prediction error:", str(e))  # Debugging
+        logger.error(f"Prediction error: {str(e)}")
-        return f"Error: {str(e)}", 400
+        return jsonify({
            "status": "error",
            "message": "Failed to process prediction",
            "error": str(e)
        }), 400
 if __name__ == '__main__':
-    app.run(debug=True)
+    app.run(host='0.0.0.0', port=5000, debug=True)
@@ -1,64 +1,122 @@
 import os
 from pathlib import Path
 from typing import Dict, Any, List
 from sklearn.ensemble import RandomForestClassifier
-# Directory Paths
+class Config:
-BASE_DIR = Path(__file__).parent.parent
+    """Centralized configuration for the fraud detection system"""
-DATA_DIR = BASE_DIR / 'data'
+    
-RAW_DATA_DIR = DATA_DIR / 'raw'
+    # ========== Directory Structure ==========
-PROCESSED_DATA_DIR = DATA_DIR / 'processed'
+    BASE_DIR = Path(__file__).parent.parent
-MODELS_DIR = BASE_DIR / 'models'
+    DATA_DIR = BASE_DIR / 'data'
-REPORTS_DIR = BASE_DIR / 'reports'
+    RAW_DATA_DIR = DATA_DIR / 'raw'
-FIGURES_DIR = REPORTS_DIR / 'figures'
+    PROCESSED_DATA_DIR = DATA_DIR / 'processed'
    MODELS_DIR = BASE_DIR / 'models'
    REPORTS_DIR = BASE_DIR / 'reports'
    FIGURES_DIR = REPORTS_DIR / 'figures'
    LOGS_DIR = BASE_DIR / 'logs'
    # ========== File Paths ==========
    TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
    TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
    TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
    TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
    MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
    ERROR_LOG_PATH = LOGS_DIR / 'errors.log'
    # ========== Model Configuration ==========
    MODEL_PARAMS: Dict[str, Any] = {
        'classifier': RandomForestClassifier,
        'classifier_params': {
            'n_estimators': 100,
            'max_depth': None,
            'min_samples_split': 2,
            'class_weight': 'balanced',
            'random_state': 42,
            'n_jobs': -1
        }
    }
    # ========== Feature Engineering ==========
    NUMERIC_FEATURES: List[str] = [
        'amt', 'city_pop', 'hour', 
        'day_of_week', 'month', 'age', 
        'distance'
    ]
    CATEGORICAL_FEATURES: List[str] = [
        'category', 'gender', 'job', 'merchant'
    ]
    FEATURE_CONFIG: Dict[str, Any] = {
        'numeric_features': NUMERIC_FEATURES,
        'categorical_features': CATEGORICAL_FEATURES,
        'time_features': ['hour', 'day_of_week', 'month'],
        'demographic_features': ['age'],
        'geographic_features': ['distance'],
        'behavioral_features': ['txn_count_24h', 'time_since_last_txn'],
        'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'],
        'required_features': NUMERIC_FEATURES + CATEGORICAL_FEATURES
    }
    # ========== Prediction Configuration ==========
    PREDICTION_THRESHOLDS: Dict[str, float] = {
        'high_risk': 0.75,
        'medium_risk': 0.25,
        'low_risk': 0.01
    }
    # ========== Flask Configuration ==========
    class FlaskConfig:
        """Flask settings, each overridable through an environment variable."""

        # Accept the usual truthy spellings ("true", "1", "yes", "on") in any
        # letter case; an exact match on 'True' would silently disable debug
        # for FLASK_DEBUG=true or FLASK_DEBUG=1.
        # NOTE(review): debug still defaults to on when FLASK_DEBUG is unset;
        # make sure real deployments set it explicitly.
        DEBUG = os.getenv('FLASK_DEBUG', 'True').strip().lower() in ('true', '1', 'yes', 'on')
        # NOTE(review): the fallback below is a placeholder, not a secret;
        # set FLASK_SECRET_KEY in any real deployment.
        SECRET_KEY = os.getenv('FLASK_SECRET_KEY', 'your-secret-key-here')
        # Upload size limit in bytes; 16 MB unless MAX_UPLOAD_SIZE is set.
        MAX_CONTENT_LENGTH = int(os.getenv('MAX_UPLOAD_SIZE', str(16 * 1024 * 1024)))
        JSONIFY_PRETTYPRINT_REGULAR = True
        # None when FLASK_SERVER_NAME is not set.
        SERVER_NAME = os.getenv('FLASK_SERVER_NAME')
    # ========== Logging Configuration ==========
    LOGGING_CONFIG: Dict[str, Any] = {
        'version': 1,
        'formatters': {
            'default': {
                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'file': {
                'class': 'logging.FileHandler',
                'filename': ERROR_LOG_PATH,
                'formatter': 'default'
            },
            'console': {
                'class': 'logging.StreamHandler',
                'formatter': 'default'
            }
        },
        'root': {
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'handlers': ['file', 'console']
        }
    }
    # ========== Constants ==========
    DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S'
    RANDOM_STATE: int = 42
    TEST_SIZE: float = 0.2
    @classmethod
    def init_directories(cls):
        """Create every directory the configuration relies on.

        Missing parent directories are created as well, and directories that
        already exist are left untouched.
        """
        targets = (
            cls.RAW_DATA_DIR,
            cls.PROCESSED_DATA_DIR,
            cls.MODELS_DIR,
            cls.FIGURES_DIR,
            cls.LOGS_DIR,
        )
        for path in targets:
            path.mkdir(parents=True, exist_ok=True)
-# File Paths
+# Initialize directories on import
-TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
+Config.init_directories()
 TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
 TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
 TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
 MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
 # Model Configuration
 MODEL_PARAMS = {
    'classifier': RandomForestClassifier,
    'classifier_params': {
        'n_estimators': 100,
        'max_depth': None,
        'min_samples_split': 2,
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1
    },
    'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'],
    'categorical_features': ['category', 'gender', 'job', 'merchant']
 }
 # Feature Engineering Configuration
 FEATURE_CONFIG = {
    'time_features': ['hour', 'day_of_week', 'month'],
    'demographic_features': ['age'],
    'geographic_features': ['distance'],
    'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time']
 }
 # Evaluation Metrics
 EVAL_METRICS = {
    'threshold': 0.5,
    'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'],
    'target_names': ['Legitimate', 'Fraud']
 }
 # Flask App Configuration
 class FlaskConfig:
    """Hard-coded Flask settings."""
    # NOTE(review): debug is unconditionally on here — confirm this class is
    # never used for a production deployment.
    DEBUG = True
    # NOTE(review): placeholder value; a real deployment needs its own secret.
    SECRET_KEY = 'your-secret-key-here'
    MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB upload limit
    JSONIFY_PRETTYPRINT_REGULAR = True
 # Create directories if they don't exist
 for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]:
    directory.mkdir(parents=True, exist_ok=True)
 # Constants
 DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
 RANDOM_STATE = 42
 TEST_SIZE = 0.2
@@ -1,14 +1,17 @@
 import pandas as pd
 import numpy as np
 from datetime import datetime
 from config import TRAIN_DATA_PATH, TEST_DATA_PATH
 # Load a CSV in one pass (low_memory=False) so pandas infers each column's dtype from the whole file
 def load_data(filepath):
    """Read the CSV at *filepath* into a DataFrame.

    ``low_memory=False`` is passed through to ``pandas.read_csv``.
    """
    frame = pd.read_csv(filepath, low_memory=False)
    return frame
-train_df = load_data('data/raw/fraudTrain.csv')
+
-test_df = load_data('data/raw/fraudTest.csv')
+train_df = load_data(TRAIN_DATA_PATH)
 test_df = load_data(TEST_DATA_PATH)
 # Data cleaning function
 def clean_data(df):
@@ -0,0 +1,11 @@
 # exceptions.py
 class FraudDetectionError(Exception):
    """Base exception class; the other exceptions in this module derive from it."""
 class DataValidationError(FraudDetectionError):
    """Raised when data fails validation.

    Subclass of FraudDetectionError, so handlers for the base class catch it too.
    """
 class ModelServeError(FraudDetectionError):
    """Raised for API prediction failures.

    Subclass of FraudDetectionError, so handlers for the base class catch it too.
    """
@@ -7,6 +7,7 @@ from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 import joblib
 from datetime import datetime
 from config import TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH
 def load_data(train_file, test_file):
@@ -20,25 +21,6 @@ def load_data(train_file, test_file):
    return train_df, test_df
 def feature_engineering(df):
    """Add time, age and distance columns to *df* in place and return it.

    Adds ``hour``, ``day_of_week`` and ``month`` taken from
    ``trans_date_trans_time``, ``age`` as whole 365-day years elapsed between
    ``dob`` and today, and ``distance`` as the Euclidean norm of the raw
    latitude/longitude differences between user and merchant.
    """
    # Parse the transaction timestamp, then pull the calendar parts from it
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    txn_time = df['trans_date_trans_time'].dt
    for column, part in (('hour', 'hour'), ('day_of_week', 'dayofweek'), ('month', 'month')):
        df[column] = getattr(txn_time, part)

    # Age in whole years, counting every year as 365 days
    df['dob'] = pd.to_datetime(df['dob'])
    elapsed = pd.to_datetime('today') - df['dob']
    df['age'] = elapsed.dt.days // 365

    # Straight-line difference of the raw coordinate values (not a geodesic distance)
    lat_gap = df['lat'] - df['merch_lat']
    long_gap = df['long'] - df['merch_long']
    df['distance'] = np.sqrt(lat_gap**2 + long_gap**2)
    return df
 def train_model(train_df, test_df):
    # Define features and target
    X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
@@ -79,11 +61,7 @@ def train_model(train_df, test_df):
 def main():
    # Load data
-    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')
+    train_df, test_df = load_data(TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH)
    # Feature engineering
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)
    # Print dataset sizes after cleaning
    print(f"Training samples after cleaning: {len(train_df)}")