update for model_training script

2025-07-29 10:27:21 +01:00
parent 6c1224eaca
commit 5c5ad60563
7 changed files with 246 additions and 152 deletions
@@ -1,52 +1,105 @@
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request, jsonify
 import joblib
 import pandas as pd
 import numpy as np
 from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, Tuple, Union
+from dataclasses import dataclass
+import logging
+from config import MODEL_PATH, FEATURE_CONFIG
+
+# Initialize logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 app = Flask(__name__)

-# Load the model
-try:
-    model = joblib.load('models/fraud_model.pkl')
-    print("Model loaded successfully")
-except Exception as e:
-    print(f"Error loading model: {e}")
-    raise
+@dataclass
+class PredictionResult:
+    """Outcome of one fraud prediction, as produced by the service.
+
+    Attributes:
+        is_fraud: Predicted class label converted to a bool.
+        probability: Fraud-class probability expressed as a percentage.
+        confidence: One of "high", "medium" or "low".
+    """
+
+    is_fraud: bool
+    probability: float
+    confidence: str

-def preprocess_input(data):
-    # Convert to DataFrame
-    df = pd.DataFrame([data])
-    
-    # Ensure numeric fields are properly converted
-    numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']
-    for field in numeric_fields:
-        df[field] = pd.to_numeric(df[field], errors='coerce')
-    
-    # Convert and extract datetime features
-    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
-    df['hour'] = df['trans_date_trans_time'].dt.hour
-    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
-    df['month'] = df['trans_date_trans_time'].dt.month
-    
-    # Calculate age from dob
-    df['dob'] = pd.to_numeric(pd.to_datetime(df['dob']).astype(np.int64) / 10**9)
-    df['age'] = (pd.Timestamp.now().timestamp() - df['dob']) / (365 * 24 * 3600)
-    
-    # Calculate distance safely
-    df['distance'] = np.sqrt(
-        (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + 
-        (df['long'].astype(float) - df['merch_long'].astype(float))**2
-    )
-    
-    # Ensure all expected columns are present
-    expected_columns = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance',
-                       'category', 'gender', 'job', 'merchant']
-    for col in expected_columns:
-        if col not in df.columns:
-            df[col] = 0  # Default value if missing
-    
-    return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
+class FraudDetectionService:
+    """Wrap a serialized fraud classifier together with its input preprocessing.
+
+    The model is loaded once at construction time; ``predict`` then turns a raw
+    request mapping into a ``PredictionResult``.
+    """
+
+    # Raw coordinate columns that must all be present to derive ``distance``.
+    _COORDINATE_COLUMNS = ('lat', 'long', 'merch_lat', 'merch_long')
+
+    def __init__(self, model_path: Path):
+        """Load the model at *model_path* and record the expected feature names."""
+        self.model = self._load_model(model_path)
+        self.required_features = (
+            FEATURE_CONFIG['numeric_features'] + FEATURE_CONFIG['categorical_features']
+        )
+
+    def _load_model(self, model_path: Path) -> Any:
+        """Deserialize the model and verify it can serve predictions.
+
+        Raises:
+            ValueError: If the loaded object lacks ``predict`` or ``predict_proba``.
+            Exception: Anything raised by ``joblib.load`` is logged and re-raised.
+        """
+        try:
+            # NOTE(review): joblib.load unpickles arbitrary objects; the model
+            # file must come from a trusted location.
+            model = joblib.load(model_path)
+            # predict() calls both methods, so fail at start-up rather than on
+            # the first request when one of them is missing.
+            for method in ('predict', 'predict_proba'):
+                if not callable(getattr(model, method, None)):
+                    raise ValueError(f"Invalid model object - missing {method} method")
+            logger.info("Model loaded successfully")
+            return model
+        except Exception as e:
+            logger.exception("Model loading failed: %s", e)
+            raise
+
+    def preprocess_input(self, input_data: Dict[str, Any]) -> pd.DataFrame:
+        """Build a one-row feature frame from a raw request mapping.
+
+        Numeric fields are coerced (unparsable values become NaN), time, age and
+        distance features are derived when their source columns are present,
+        and every feature in ``self.required_features`` that is still missing is
+        filled with 0. Columns listed in ``FEATURE_CONFIG['drop_columns']`` are
+        removed from the result.
+
+        Raises:
+            Exception: Whatever ``pd.to_datetime`` raises for an unparsable
+                ``trans_date_trans_time`` or ``dob`` value.
+        """
+        df = pd.DataFrame([input_data])
+
+        # Type conversion: request values may arrive as strings.
+        for field in FEATURE_CONFIG['numeric_features']:
+            if field in df.columns:
+                df[field] = pd.to_numeric(df[field], errors='coerce')
+
+        # Handle datetime features
+        if 'trans_date_trans_time' in df.columns:
+            df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
+            df['hour'] = df['trans_date_trans_time'].dt.hour
+            df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
+            df['month'] = df['trans_date_trans_time'].dt.month
+
+        # Age in (fractional) years relative to the current local time.
+        # NOTE(review): confirm this matches how 'age' was computed for training.
+        if 'dob' in df.columns:
+            df['dob'] = pd.to_datetime(df['dob'])
+            df['age'] = (pd.Timestamp.now() - df['dob']).dt.days / 365.25
+
+        # Geospatial feature: straight-line distance in coordinate degrees.
+        # Coordinates are not part of the numeric feature list, so coerce them
+        # here; an empty or malformed value yields NaN instead of raising.
+        if all(col in df.columns for col in self._COORDINATE_COLUMNS):
+            coords = {
+                col: pd.to_numeric(df[col], errors='coerce').astype(float)
+                for col in self._COORDINATE_COLUMNS
+            }
+            df['distance'] = np.sqrt(
+                (coords['lat'] - coords['merch_lat']) ** 2
+                + (coords['long'] - coords['merch_long']) ** 2
+            )
+
+        # Ensure all expected features exist.
+        # NOTE(review): categorical features also receive the numeric default 0;
+        # verify the model pipeline tolerates that value.
+        for feature in self.required_features:
+            if feature not in df.columns:
+                df[feature] = 0  # Safe default
+                logger.warning("Missing feature filled with default: %s", feature)
+
+        return df.drop(FEATURE_CONFIG['drop_columns'], axis=1, errors='ignore')
+
+    def predict(self, input_data: Dict[str, Any]) -> PredictionResult:
+        """Score one transaction and return label, fraud probability and band.
+
+        Raises:
+            Exception: Any preprocessing or model error is logged and re-raised.
+        """
+        try:
+            processed_data = self.preprocess_input(input_data)
+            probabilities = self.model.predict_proba(processed_data)
+            # Column 1 is taken as the fraud class (assumes classes are [0, 1]);
+            # converted to a plain float percentage.
+            fraud_prob = float(probabilities[0][1]) * 100
+            prediction = self.model.predict(processed_data)[0]
+
+            # The band is derived from the fraud probability alone, so it
+            # grows with fraud risk rather than with certainty of the label.
+            if fraud_prob > 75:
+                confidence = "high"
+            elif fraud_prob > 25:
+                confidence = "medium"
+            else:
+                confidence = "low"
+
+            return PredictionResult(
+                is_fraud=bool(prediction),
+                probability=round(fraud_prob, 2),
+                confidence=confidence
+            )
+
+        except Exception as e:
+            logger.exception("Prediction failed: %s", e)
+            raise
+
+# Build the shared service instance at import time; a model that cannot be
+# loaded is fatal, so log it as critical and let the error propagate.
+try:
+    fraud_service = FraudDetectionService(MODEL_PATH)
+except Exception as exc:
+    logger.critical("Service initialization failed: %s", exc)
+    raise

@app.route('/')
 def home():
@@ -55,36 +108,27 @@ def home():
@app.route('/predict', methods=['POST'])
 def predict():
+    """Score the submitted form and render the result, or return a JSON error."""
    try:
-        # Get data from form
-        data = request.form.to_dict()
-        print("Received data:", data)  # Debugging
+        input_data = request.form.to_dict()
+        # Log only the field names: the values may contain personal data
+        # such as the date of birth.
+        logger.info("Received prediction request with fields: %s", sorted(input_data))
        
-        # Preprocess the input
-        processed_data = preprocess_input(data)
-        print("Processed data:", processed_data)  # Debugging
+        result = fraud_service.predict(input_data)
        
-        # Get prediction probabilities
-        probabilities = model.predict_proba(processed_data)
-        print("Raw probabilities:", probabilities)  # Debugging
-        
-        # Extract fraud probability (class 1)
-        fraud_probability = probabilities[0][1] * 100  # Convert to percentage
-        
-        # Make prediction
-        prediction = model.predict(processed_data)[0]
-        
-        result = {
-            'prediction': bool(prediction),
-            'probability': float(fraud_probability),
-            'is_fraud': bool(prediction)
+        response = {
+            # The previous payload exposed 'prediction'; kept in case the
+            # template still reads it.
+            "prediction": result.is_fraud,
+            "is_fraud": result.is_fraud,
+            "probability": result.probability,
+            "confidence": result.confidence,
+            "status": "success"
        }
        
-        print("Prediction result:", result)  # Debugging
-        return render_template('index.html', prediction=result)
+        return render_template('index.html', prediction=response)
    
    except Exception as e:
-        print("Prediction error:", str(e))  # Debugging
-        return f"Error: {str(e)}", 400
+        logger.error("Prediction error: %s", e)
+        # Do not echo the raw exception text to the client; the details are in
+        # the server log. Only the exception class name is exposed.
+        return jsonify({
+            "status": "error",
+            "message": "Failed to process prediction",
+            "error": type(e).__name__
+        }), 400

 if __name__ == '__main__':
-    app.run(debug=True)
+    # The interactive debugger allows arbitrary code execution, so it must
+    # never be reachable from other hosts: debug mode is opt-in through
+    # FLASK_DEBUG and binds to loopback only when enabled.
+    import os
+    debug_enabled = os.getenv('FLASK_DEBUG', 'False').strip().lower() in ('1', 'true', 'yes', 'on')
+    bind_host = '127.0.0.1' if debug_enabled else '0.0.0.0'
+    app.run(host=bind_host, port=5000, debug=debug_enabled)
@@ -1,64 +1,122 @@
 import os
 from pathlib import Path
+from typing import Dict, Any, List
+from sklearn.ensemble import RandomForestClassifier

-# Directory Paths
-BASE_DIR = Path(__file__).parent.parent
-DATA_DIR = BASE_DIR / 'data'
-RAW_DATA_DIR = DATA_DIR / 'raw'
-PROCESSED_DATA_DIR = DATA_DIR / 'processed'
-MODELS_DIR = BASE_DIR / 'models'
-REPORTS_DIR = BASE_DIR / 'reports'
-FIGURES_DIR = REPORTS_DIR / 'figures'
+class Config:
+    """Centralized configuration for the fraud detection system.
+
+    All values are class attributes; nothing needs to be instantiated.
+    ``init_directories`` creates the data, model, report and log directories.
+    """
+
+    # ========== Directory Structure ==========
+    # Project root is taken as the directory two levels above this file.
+    BASE_DIR = Path(__file__).parent.parent
+    DATA_DIR = BASE_DIR / 'data'
+    RAW_DATA_DIR = DATA_DIR / 'raw'
+    PROCESSED_DATA_DIR = DATA_DIR / 'processed'
+    MODELS_DIR = BASE_DIR / 'models'
+    REPORTS_DIR = BASE_DIR / 'reports'
+    FIGURES_DIR = REPORTS_DIR / 'figures'
+    LOGS_DIR = BASE_DIR / 'logs'
+
+    # ========== File Paths ==========
+    TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
+    TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
+    TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
+    TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
+    MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
+    ERROR_LOG_PATH = LOGS_DIR / 'errors.log'
+
+    # ========== Model Configuration ==========
+    # 'classifier' holds the estimator class itself; 'classifier_params' are
+    # the keyword arguments intended for it.
+    MODEL_PARAMS: Dict[str, Any] = {
+        'classifier': RandomForestClassifier,
+        'classifier_params': {
+            'n_estimators': 100,
+            'max_depth': None,
+            'min_samples_split': 2,
+            'class_weight': 'balanced',
+            'random_state': 42,
+            'n_jobs': -1
+        }
+    }
+
+    # ========== Feature Engineering ==========
+    NUMERIC_FEATURES: List[str] = [
+        'amt', 'city_pop', 'hour',
+        'day_of_week', 'month', 'age',
+        'distance'
+    ]
+
+    CATEGORICAL_FEATURES: List[str] = [
+        'category', 'gender', 'job', 'merchant'
+    ]
+
+    FEATURE_CONFIG: Dict[str, Any] = {
+        'numeric_features': NUMERIC_FEATURES,
+        'categorical_features': CATEGORICAL_FEATURES,
+        'time_features': ['hour', 'day_of_week', 'month'],
+        'demographic_features': ['age'],
+        'geographic_features': ['distance'],
+        'behavioral_features': ['txn_count_24h', 'time_since_last_txn'],
+        'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'],
+        'required_features': NUMERIC_FEATURES + CATEGORICAL_FEATURES
+    }
+
+    # ========== Prediction Configuration ==========
+    # Expressed as fractions in [0, 1].
+    PREDICTION_THRESHOLDS: Dict[str, float] = {
+        'high_risk': 0.75,
+        'medium_risk': 0.25,
+        'low_risk': 0.01
+    }
+
+    # ========== Flask Configuration ==========
+    class FlaskConfig:
+        """Flask settings, each overridable through an environment variable."""
+
+        # Accept the usual truthy spellings (e.g. FLASK_DEBUG=1 or
+        # FLASK_DEBUG=true), not only the exact string 'True'.
+        # NOTE(review): debug stays enabled by default, as before; consider
+        # defaulting to off for deployed environments.
+        DEBUG = os.getenv('FLASK_DEBUG', 'True').strip().lower() in ('1', 'true', 'yes', 'on')
+        # NOTE(review): the fallback is a placeholder; set FLASK_SECRET_KEY
+        # to a real secret outside of local development.
+        SECRET_KEY = os.getenv('FLASK_SECRET_KEY', 'your-secret-key-here')
+        MAX_CONTENT_LENGTH = int(os.getenv('MAX_UPLOAD_SIZE', 16 * 1024 * 1024))  # 16MB default
+        JSONIFY_PRETTYPRINT_REGULAR = True
+        SERVER_NAME = os.getenv('FLASK_SERVER_NAME', None)
+
+    # ========== Logging Configuration ==========
+    # Dictionary in the logging.config.dictConfig schema: one file handler
+    # writing to ERROR_LOG_PATH and one console handler, both on the root logger.
+    LOGGING_CONFIG: Dict[str, Any] = {
+        'version': 1,
+        'formatters': {
+            'default': {
+                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
+            }
+        },
+        'handlers': {
+            'file': {
+                'class': 'logging.FileHandler',
+                'filename': ERROR_LOG_PATH,
+                'formatter': 'default'
+            },
+            'console': {
+                'class': 'logging.StreamHandler',
+                'formatter': 'default'
+            }
+        },
+        'root': {
+            'level': os.getenv('LOG_LEVEL', 'INFO'),
+            'handlers': ['file', 'console']
+        }
+    }
+
+    # ========== Constants ==========
+    DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S'
+    RANDOM_STATE: int = 42
+    TEST_SIZE: float = 0.2
+
+    @classmethod
+    def init_directories(cls):
+        """Create every required directory, including parents, if missing."""
+        required_dirs = [
+            cls.RAW_DATA_DIR,
+            cls.PROCESSED_DATA_DIR,
+            cls.MODELS_DIR,
+            cls.FIGURES_DIR,
+            cls.LOGS_DIR
+        ]
+
+        for directory in required_dirs:
+            directory.mkdir(parents=True, exist_ok=True)

-# File Paths
-TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
-TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
-TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
-TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
-MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
-
-# Model Configuration
-MODEL_PARAMS = {
-    'classifier': RandomForestClassifier,
-    'classifier_params': {
-        'n_estimators': 100,
-        'max_depth': None,
-        'min_samples_split': 2,
-        'class_weight': 'balanced',
-        'random_state': 42,
-        'n_jobs': -1
-    },
-    'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'],
-    'categorical_features': ['category', 'gender', 'job', 'merchant']
-}
-
-# Feature Engineering Configuration
-FEATURE_CONFIG = {
-    'time_features': ['hour', 'day_of_week', 'month'],
-    'demographic_features': ['age'],
-    'geographic_features': ['distance'],
-    'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time']
-}
-
-# Evaluation Metrics
-EVAL_METRICS = {
-    'threshold': 0.5,
-    'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'],
-    'target_names': ['Legitimate', 'Fraud']
-}
-
-# Flask App Configuration
-class FlaskConfig:
-    DEBUG = True
-    SECRET_KEY = 'your-secret-key-here'
-    MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB upload limit
-    JSONIFY_PRETTYPRINT_REGULAR = True
-
-# Create directories if they don't exist
-for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]:
-    directory.mkdir(parents=True, exist_ok=True)
-
-# Constants
-DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
-RANDOM_STATE = 42
-TEST_SIZE = 0.2
+# Initialize directories on import
+Config.init_directories()
+
+# Backward-compatible module-level aliases. Other modules still import these
+# names directly (e.g. ``from config import MODEL_PATH, FEATURE_CONFIG``), so
+# keep them importable now that the values live on ``Config``.
+BASE_DIR = Config.BASE_DIR
+DATA_DIR = Config.DATA_DIR
+RAW_DATA_DIR = Config.RAW_DATA_DIR
+PROCESSED_DATA_DIR = Config.PROCESSED_DATA_DIR
+MODELS_DIR = Config.MODELS_DIR
+REPORTS_DIR = Config.REPORTS_DIR
+FIGURES_DIR = Config.FIGURES_DIR
+TRAIN_DATA_PATH = Config.TRAIN_DATA_PATH
+TEST_DATA_PATH = Config.TEST_DATA_PATH
+TRAIN_PROCESSED_PATH = Config.TRAIN_PROCESSED_PATH
+TEST_PROCESSED_PATH = Config.TEST_PROCESSED_PATH
+MODEL_PATH = Config.MODEL_PATH
+MODEL_PARAMS = Config.MODEL_PARAMS
+FEATURE_CONFIG = Config.FEATURE_CONFIG
+FlaskConfig = Config.FlaskConfig
+DATE_FORMAT = Config.DATE_FORMAT
+RANDOM_STATE = Config.RANDOM_STATE
+TEST_SIZE = Config.TEST_SIZE
@@ -1,14 +1,17 @@
 import pandas as pd
 import numpy as np
 from datetime import datetime
+from config import TRAIN_DATA_PATH, TEST_DATA_PATH


 # Load data with proper dtype specification
 def load_data(filepath):
    return pd.read_csv(filepath, low_memory=False)

-train_df = load_data('data/raw/fraudTrain.csv')
-test_df = load_data('data/raw/fraudTest.csv')
+
+train_df = load_data(TRAIN_DATA_PATH)
+test_df = load_data(TEST_DATA_PATH)
+

 # Data cleaning function
 def clean_data(df):
@@ -0,0 +1,11 @@
+# exceptions.py
+class FraudDetectionError(Exception):
+    """Root of the project's exception hierarchy.
+
+    Catch this type to handle any error defined in this module.
+    """
+
+
+class DataValidationError(FraudDetectionError):
+    """Signals that data did not pass validation."""
+    
+
+class ModelServeError(FraudDetectionError):
+    """Signals a failure while serving a prediction through the API."""
@@ -7,6 +7,7 @@ from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 import joblib
 from datetime import datetime
+from config import TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH


 def load_data(train_file, test_file):
@@ -20,25 +21,6 @@ def load_data(train_file, test_file):
    return train_df, test_df


-def feature_engineering(df):
-    # Convert transaction time to datetime
-    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
-
-    # Extract time features
-    df['hour'] = df['trans_date_trans_time'].dt.hour
-    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
-    df['month'] = df['trans_date_trans_time'].dt.month
-
-    # Calculate age from dob
-    df['dob'] = pd.to_datetime(df['dob'])
-    df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365
-
-    # Calculate distance between user and merchant
-    df['distance'] = np.sqrt((df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2)
-
-    return df
-
-
 def train_model(train_df, test_df):
    # Define features and target
    X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
@@ -79,11 +61,7 @@ def train_model(train_df, test_df):

 def main():
    # Load data
-    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')
-
-    # Feature engineering
-    train_df = feature_engineering(train_df)
-    test_df = feature_engineering(test_df)
+    train_df, test_df = load_data(TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH)

    # Print dataset sizes after cleaning
    print(f"Training samples after cleaning: {len(train_df)}")