diff --git a/src/api/app.py b/src/api/app.py index 9f3584e..620f6fe 100644 --- a/src/api/app.py +++ b/src/api/app.py @@ -1,52 +1,105 @@ -from flask import Flask, render_template, request +from flask import Flask, render_template, request, jsonify import joblib import pandas as pd import numpy as np from datetime import datetime +from pathlib import Path +from typing import Dict, Any, Tuple, Union +from dataclasses import dataclass +import logging +from config import MODEL_PATH, FEATURE_CONFIG + +# Initialize logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) app = Flask(__name__) -# Load the model -try: - model = joblib.load('models/fraud_model.pkl') - print("Model loaded successfully") -except Exception as e: - print(f"Error loading model: {e}") - raise +@dataclass +class PredictionResult: + is_fraud: bool + probability: float + confidence: str # "high", "medium", "low" -def preprocess_input(data): - # Convert to DataFrame - df = pd.DataFrame([data]) - - # Ensure numeric fields are properly converted - numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'] - for field in numeric_fields: - df[field] = pd.to_numeric(df[field], errors='coerce') - - # Convert and extract datetime features - df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) - df['hour'] = df['trans_date_trans_time'].dt.hour - df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek - df['month'] = df['trans_date_trans_time'].dt.month - - # Calculate age from dob - df['dob'] = pd.to_numeric(pd.to_datetime(df['dob']).astype(np.int64) / 10**9) - df['age'] = (pd.Timestamp.now().timestamp() - df['dob']) / (365 * 24 * 3600) - - # Calculate distance safely - df['distance'] = np.sqrt( - (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + - (df['long'].astype(float) - df['merch_long'].astype(float))**2 - ) - - # Ensure all expected columns are present - expected_columns = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance', - 'category', 'gender', 'job', 'merchant'] - for col in expected_columns: - if col not in df.columns: - df[col] = 0 # Default value if missing - - return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore') +class FraudDetectionService: + def __init__(self, model_path: Path): + self.model = self._load_model(model_path) + self.required_features = FEATURE_CONFIG['numeric_features'] + FEATURE_CONFIG['categorical_features'] + + def _load_model(self, model_path: Path) -> Any: + """Load trained model with error handling""" + try: + model = joblib.load(model_path) + if not hasattr(model, 'predict'): + raise ValueError("Invalid model object - missing predict method") + logger.info("Model loaded successfully") + return model + except Exception as e: + logger.error(f"Model loading failed: {str(e)}") + raise + + def preprocess_input(self, input_data: Dict[str, Any]) -> pd.DataFrame: + """Convert and validate input data""" + df = pd.DataFrame([input_data]) + + # Type conversion + for field in FEATURE_CONFIG['numeric_features']: + if field in df.columns: + df[field] = pd.to_numeric(df[field], errors='coerce') + + # Handle datetime features + if 'trans_date_trans_time' in df.columns: + df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) + df['hour'] = df['trans_date_trans_time'].dt.hour + df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek + df['month'] = df['trans_date_trans_time'].dt.month + + # Age calculation + if 'dob' in df.columns: + df['dob'] = pd.to_datetime(df['dob']) + df['age'] = (pd.Timestamp.now() - df['dob']).dt.days / 365.25 + + # Geospatial features + if all(col in df.columns for col in ['lat', 'long', 'merch_lat', 'merch_long']): + df['distance'] = np.sqrt( + (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + + (df['long'].astype(float) - df['merch_long'].astype(float))**2 + ) + + # Ensure all expected features exist + for feature in self.required_features: + if feature not in df.columns: + df[feature] = 0 # Safe default + logger.warning(f"Missing feature filled with default: {feature}") + + return df.drop(FEATURE_CONFIG['drop_columns'], axis=1, errors='ignore') + + def predict(self, input_data: Dict[str, Any]) -> PredictionResult: + """Make fraud prediction with probabilities""" + try: + processed_data = self.preprocess_input(input_data) + probabilities = self.model.predict_proba(processed_data) + fraud_prob = probabilities[0][1] * 100 + prediction = self.model.predict(processed_data)[0] + + confidence = "high" if fraud_prob > 75 else "medium" if fraud_prob > 25 else "low" + + return PredictionResult( + is_fraud=bool(prediction), + probability=round(fraud_prob, 2), + confidence=confidence + ) + + except Exception as e: + logger.error(f"Prediction failed: {str(e)}") + raise + +# Initialize service +try: + fraud_service = FraudDetectionService(MODEL_PATH) +except Exception as e: + logger.critical(f"Service initialization failed: {str(e)}") + raise @app.route('/') def home(): @@ -55,36 +108,27 @@ def home(): @app.route('/predict', methods=['POST']) def predict(): try: - # Get data from form - data = request.form.to_dict() - print("Received data:", data) # Debugging + input_data = request.form.to_dict() + logger.info(f"Received prediction request: {input_data}") - # Preprocess the input - processed_data = preprocess_input(data) - print("Processed data:", processed_data) # Debugging + result = fraud_service.predict(input_data) - # Get prediction probabilities - probabilities = model.predict_proba(processed_data) - print("Raw probabilities:", probabilities) # Debugging - - # Extract fraud probability (class 1) - fraud_probability = probabilities[0][1] * 100 # Convert to percentage - - # Make prediction - prediction = model.predict(processed_data)[0] - - result = { - 'prediction': bool(prediction), - 'probability': float(fraud_probability), - 'is_fraud': bool(prediction) + response = { + "is_fraud": result.is_fraud, + "probability": result.probability, + "confidence": result.confidence, + "status": "success" } - print("Prediction result:", result) # Debugging - return render_template('index.html', prediction=result) + return render_template('index.html', prediction=response) except Exception as e: - print("Prediction error:", str(e)) # Debugging - return f"Error: {str(e)}", 400 + logger.error(f"Prediction error: {str(e)}") + return jsonify({ + "status": "error", + "message": "Failed to process prediction", + "error": str(e) + }), 400 if __name__ == '__main__': - app.run(debug=True) \ No newline at end of file + app.run(host='0.0.0.0', port=5000, debug=True) \ No newline at end of file diff --git a/src/api/inference.py b/src/api/inference.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/config.py b/src/config.py index b00f267..6456749 100644 --- a/src/config.py +++ b/src/config.py @@ -1,64 +1,122 @@ import os from pathlib import Path +from typing import Dict, Any, List +from sklearn.ensemble import RandomForestClassifier -# Directory Paths -BASE_DIR = Path(__file__).parent.parent -DATA_DIR = BASE_DIR / 'data' -RAW_DATA_DIR = DATA_DIR / 'raw' -PROCESSED_DATA_DIR = DATA_DIR / 'processed' -MODELS_DIR = BASE_DIR / 'models' -REPORTS_DIR = BASE_DIR / 'reports' -FIGURES_DIR = REPORTS_DIR / 'figures' +class Config: + """Centralized configuration for the fraud detection system""" + + # ========== Directory Structure ========== + BASE_DIR = Path(__file__).parent.parent + DATA_DIR = BASE_DIR / 'data' + RAW_DATA_DIR = DATA_DIR / 'raw' + PROCESSED_DATA_DIR = DATA_DIR / 'processed' + MODELS_DIR = BASE_DIR / 'models' + REPORTS_DIR = BASE_DIR / 'reports' + FIGURES_DIR = REPORTS_DIR / 'figures' + LOGS_DIR = BASE_DIR / 'logs' + + # ========== File Paths ========== + TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv' + TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv' + TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv' + TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv' + MODEL_PATH = MODELS_DIR / 'fraud_model.pkl' + ERROR_LOG_PATH = LOGS_DIR / 'errors.log' + + # ========== Model Configuration ========== + MODEL_PARAMS: Dict[str, Any] = { + 'classifier': RandomForestClassifier, + 'classifier_params': { + 'n_estimators': 100, + 'max_depth': None, + 'min_samples_split': 2, + 'class_weight': 'balanced', + 'random_state': 42, + 'n_jobs': -1 + } + } + + # ========== Feature Engineering ========== + NUMERIC_FEATURES: List[str] = [ + 'amt', 'city_pop', 'hour', + 'day_of_week', 'month', 'age', + 'distance' + ] + + CATEGORICAL_FEATURES: List[str] = [ + 'category', 'gender', 'job', 'merchant' + ] + + FEATURE_CONFIG: Dict[str, Any] = { + 'numeric_features': NUMERIC_FEATURES, + 'categorical_features': CATEGORICAL_FEATURES, + 'time_features': ['hour', 'day_of_week', 'month'], + 'demographic_features': ['age'], + 'geographic_features': ['distance'], + 'behavioral_features': ['txn_count_24h', 'time_since_last_txn'], + 'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], + 'required_features': NUMERIC_FEATURES + CATEGORICAL_FEATURES + } + + # ========== Prediction Configuration ========== + PREDICTION_THRESHOLDS: Dict[str, float] = { + 'high_risk': 0.75, + 'medium_risk': 0.25, + 'low_risk': 0.01 + } + + # ========== Flask Configuration ========== + class FlaskConfig: + DEBUG = os.getenv('FLASK_DEBUG', 'True') == 'True' + SECRET_KEY = os.getenv('FLASK_SECRET_KEY', 'your-secret-key-here') + MAX_CONTENT_LENGTH = int(os.getenv('MAX_UPLOAD_SIZE', 16 * 1024 * 1024)) # 16MB default + JSONIFY_PRETTYPRINT_REGULAR = True + SERVER_NAME = os.getenv('FLASK_SERVER_NAME', None) + + # ========== Logging Configuration ========== + LOGGING_CONFIG: Dict[str, Any] = { + 'version': 1, + 'formatters': { + 'default': { + 'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s', + } + }, + 'handlers': { + 'file': { + 'class': 'logging.FileHandler', + 'filename': ERROR_LOG_PATH, + 'formatter': 'default' + }, + 'console': { + 'class': 'logging.StreamHandler', + 'formatter': 'default' + } + }, + 'root': { + 'level': os.getenv('LOG_LEVEL', 'INFO'), + 'handlers': ['file', 'console'] + } + } + + # ========== Constants ========== + DATE_FORMAT: str = '%Y-%m-%d %H:%M:%S' + RANDOM_STATE: int = 42 + TEST_SIZE: float = 0.2 + + @classmethod + def init_directories(cls): + """Ensure all required directories exist""" + required_dirs = [ + cls.RAW_DATA_DIR, + cls.PROCESSED_DATA_DIR, + cls.MODELS_DIR, + cls.FIGURES_DIR, + cls.LOGS_DIR + ] + + for directory in required_dirs: + directory.mkdir(parents=True, exist_ok=True) -# File Paths -TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv' -TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv' -TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv' -TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv' -MODEL_PATH = MODELS_DIR / 'fraud_model.pkl' - -# Model Configuration -MODEL_PARAMS = { - 'classifier': RandomForestClassifier, - 'classifier_params': { - 'n_estimators': 100, - 'max_depth': None, - 'min_samples_split': 2, - 'class_weight': 'balanced', - 'random_state': 42, - 'n_jobs': -1 - }, - 'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'], - 'categorical_features': ['category', 'gender', 'job', 'merchant'] -} - -# Feature Engineering Configuration -FEATURE_CONFIG = { - 'time_features': ['hour', 'day_of_week', 'month'], - 'demographic_features': ['age'], - 'geographic_features': ['distance'], - 'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'] -} - -# Evaluation Metrics -EVAL_METRICS = { - 'threshold': 0.5, - 'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'], - 'target_names': ['Legitimate', 'Fraud'] -} - -# Flask App Configuration -class FlaskConfig: - DEBUG = True - SECRET_KEY = 'your-secret-key-here' - MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16MB upload limit - JSONIFY_PRETTYPRINT_REGULAR = True - -# Create directories if they don't exist -for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]: - directory.mkdir(parents=True, exist_ok=True) - -# Constants -DATE_FORMAT = '%Y-%m-%d %H:%M:%S' -RANDOM_STATE = 42 -TEST_SIZE = 0.2 \ No newline at end of file +# Initialize directories on import +Config.init_directories() \ No newline at end of file diff --git a/src/data_preprocessing.py b/src/data_preprocessing.py index 066e6c2..b0ea597 100644 --- a/src/data_preprocessing.py +++ b/src/data_preprocessing.py @@ -1,14 +1,17 @@ import pandas as pd import numpy as np from datetime import datetime +from config import TRAIN_DATA_PATH, TEST_DATA_PATH # Load data with proper dtype specification def load_data(filepath): return pd.read_csv(filepath, low_memory=False) -train_df = load_data('data/raw/fraudTrain.csv') -test_df = load_data('data/raw/fraudTest.csv') + +train_df = load_data(TRAIN_DATA_PATH) +test_df = load_data(TEST_DATA_PATH) + # Data cleaning function def clean_data(df): diff --git a/src/exceptions.py b/src/exceptions.py new file mode 100644 index 0000000..ab46897 --- /dev/null +++ b/src/exceptions.py @@ -0,0 +1,11 @@ +# exceptions.py +class FraudDetectionError(Exception): + """Base exception class""" + + +class DataValidationError(FraudDetectionError): + """Raised when data fails validation""" + + +class ModelServeError(FraudDetectionError): + """API prediction failures""" \ No newline at end of file diff --git a/src/model_training.py b/src/model_training.py index 398ce71..2298c01 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -7,6 +7,7 @@ from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline import joblib from datetime import datetime +from config import TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH def load_data(train_file, test_file): @@ -20,25 +21,6 @@ def load_data(train_file, test_file): return train_df, test_df -def feature_engineering(df): - # Convert transaction time to datetime - df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) - - # Extract time features - df['hour'] = df['trans_date_trans_time'].dt.hour - df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek - df['month'] = df['trans_date_trans_time'].dt.month - - # Calculate age from dob - df['dob'] = pd.to_datetime(df['dob']) - df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365 - - # Calculate distance between user and merchant - df['distance'] = np.sqrt((df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2) - - return df - - def train_model(train_df, test_df): # Define features and target X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) @@ -79,11 +61,7 @@ def train_model(train_df, test_df): def main(): # Load data - train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv') - - # Feature engineering - train_df = feature_engineering(train_df) - test_df = feature_engineering(test_df) + train_df, test_df = load_data(TRAIN_PROCESSED_PATH, TEST_PROCESSED_PATH) # Print dataset sizes after cleaning print(f"Training samples after cleaning: {len(train_df)}") diff --git a/src/predict.py b/src/predict.py deleted file mode 100644 index e69de29..0000000