First commit

Defined file structure and completed EDA
2025-04-24 23:39:36 +01:00
commit 50e95445fb
21 changed files with 1514 additions and 0 deletions
@@ -0,0 +1,96 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import pandas as pd
+import numpy as np
+import joblib
+from pathlib import Path
+from typing import Optional
+
+from config import MODELS_DIR
+from data_preprocessing import prepare_data
+
+# Application instance; the route decorators further down register handlers on it.
+app = FastAPI(
+    title="Fraud Detection API",
+    description="API for detecting fraudulent transactions",
+    version="1.0.0",
+)
+
+class Transaction(BaseModel):
+    """Request body for ``/predict``: one raw transaction record.
+
+    The handler converts an instance into a one-row DataFrame and passes it
+    to ``prepare_data``, so the field names here are the column names the
+    preprocessing step receives.
+    """
+    # Timestamp is accepted as a plain string; preprocessing parses it with
+    # pd.to_datetime, no format is enforced at this layer.
+    trans_date_trans_time: str
+    cc_num: str
+    merchant: str
+    category: str
+    amt: float
+    first: str
+    last: str
+    gender: str
+    street: str
+    city: str
+    state: str
+    zip: str
+    # lat/long are paired with merch_lat/merch_long by preprocessing to
+    # compute the user-to-merchant distance.
+    lat: float
+    long: float
+    city_pop: int
+    job: str
+    # Date of birth as a string; preprocessing derives an age from its year.
+    dob: str
+    trans_num: str
+    unix_time: int
+    merch_lat: float
+    merch_long: float
+
+class PredictionResponse(BaseModel):
+    """Response body returned by ``/predict``."""
+    # True when fraud_probability is at least 0.5 (threshold applied in predict).
+    is_fraud: bool
+    # Value taken from column 1 of the model's predict_proba output.
+    fraud_probability: float
+    # Label produced by get_confidence_level: "Very High", "High", "Medium" or "Low".
+    confidence: str
+
+def load_model():
+    """Read the persisted model and preprocessor from MODELS_DIR.
+
+    Returns:
+        A ``(model, preprocessor)`` tuple.
+
+    Raises:
+        HTTPException: status 500 when either artifact file is missing.
+    """
+    model_path = MODELS_DIR / "fraud_model.joblib"
+    preprocessor_path = MODELS_DIR / "preprocessor.joblib"
+    try:
+        artifacts = (joblib.load(model_path), joblib.load(preprocessor_path))
+    except FileNotFoundError:
+        raise HTTPException(status_code=500, detail="Model not found. Please train the model first.")
+    return artifacts
+
+def get_confidence_level(probability: float) -> str:
+    """Map a fraud probability onto a coarse label.
+
+    Bands are checked from highest to lowest; anything below 0.5 (or not
+    comparable, such as NaN) falls through to "Low".
+    """
+    bands = (
+        (0.9, "Very High"),
+        (0.7, "High"),
+        (0.5, "Medium"),
+    )
+    for lower_bound, label in bands:
+        if probability >= lower_bound:
+            return label
+    return "Low"
+
+@app.get("/")
+async def root():
+    """Return a static welcome payload for the API root path."""
+    greeting = "Welcome to the Fraud Detection API"
+    return {"message": greeting}
+
+@app.post("/predict", response_model=PredictionResponse)
+async def predict(transaction: Transaction):
+    """Predict whether a transaction is fraudulent.
+
+    Loads the model artifacts, runs the transaction through ``prepare_data``
+    with the saved preprocessor, and thresholds the predicted probability
+    at 0.5.
+
+    Args:
+        transaction: The validated request body.
+
+    Returns:
+        A PredictionResponse with the decision, the probability and a
+        confidence label.
+
+    Raises:
+        HTTPException: status 500 if the artifacts are missing (propagated
+            unchanged from ``load_model``) or if any other error occurs, in
+            which case the detail is the error's string form.
+    """
+    try:
+        # Load model and preprocessor
+        model, preprocessor = load_model()
+
+        # Convert transaction to a one-row DataFrame
+        transaction_dict = transaction.dict()
+        df = pd.DataFrame([transaction_dict])
+
+        # Prepare data for prediction (transform only, no fitting)
+        X, _, _ = prepare_data(df, preprocessor=preprocessor)
+
+        # Column 1 of predict_proba for the single row
+        probability = model.predict_proba(X)[0, 1]
+        is_fraud = probability >= 0.5
+
+        return PredictionResponse(
+            is_fraud=bool(is_fraud),
+            fraud_probability=float(probability),
+            confidence=get_confidence_level(probability)
+        )
+    except HTTPException:
+        # Already a well-formed API error (e.g. missing artifacts); re-wrapping
+        # it through str() would mangle its detail message.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+if __name__ == "__main__":
+    # Direct execution: serve the app with uvicorn using the options below.
+    import uvicorn
+
+    server_options = {"host": "0.0.0.0", "port": 8000}
+    uvicorn.run(app, **server_options)
@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+
+# Project paths (ROOT_DIR is the parent of the directory that holds this file)
+ROOT_DIR = Path(__file__).parent.parent
+DATA_DIR = ROOT_DIR.joinpath("data")
+RAW_DATA_DIR = DATA_DIR.joinpath("raw")
+PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
+MODELS_DIR = ROOT_DIR.joinpath("models")
+
+# Data files
+TRAIN_DATA_PATH = RAW_DATA_DIR.joinpath("fraudTrain.csv")
+TEST_DATA_PATH = RAW_DATA_DIR.joinpath("fraudTest.csv")
+
+# Model parameters
+RANDOM_STATE = 42
+TEST_SIZE = 0.2
+
+# Feature engineering parameters (kept as lists: other modules concatenate them)
+CATEGORICAL_FEATURES = [
+    'merchant',
+    'category',
+    'gender',
+    'job',
+    'state',
+]
+NUMERICAL_FEATURES = [
+    'amt',
+    'lat',
+    'long',
+    'city_pop',
+    'merch_lat',
+    'merch_long',
+]
+TIME_FEATURES = ['trans_date_trans_time']
+
+# API settings
+API_HOST = "0.0.0.0"
+API_PORT = 8000
@@ -0,0 +1,112 @@
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+import joblib
+from pathlib import Path
+
+from config import (
+    CATEGORICAL_FEATURES,
+    NUMERICAL_FEATURES,
+    TIME_FEATURES,
+    PROCESSED_DATA_DIR,
+    MODELS_DIR
+)
+
+def calculate_distance(lat1, lon1, lat2, lon2):
+    """Calculate the Haversine distance between two points.
+
+    Args:
+        lat1, lon1: Latitude and longitude of the first point, in degrees.
+        lat2, lon2: Latitude and longitude of the second point, in degrees.
+            Each argument may be a scalar or an array-like (e.g. a pandas
+            Series); array-likes are processed element-wise.
+
+    Returns:
+        The great-circle distance in kilometers, with the same shape as the
+        inputs.
+    """
+    R = 6371  # Earth's radius in kilometers
+    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
+    dlat = lat2 - lat1
+    dlon = lon2 - lon1
+    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
+    # Floating-point rounding can push `a` marginally outside [0, 1], which
+    # would make sqrt/arcsin return NaN; clamp it to the valid domain.
+    a = np.clip(a, 0.0, 1.0)
+    c = 2 * np.arcsin(np.sqrt(a))
+    return R * c
+
+def extract_time_features(df):
+    """Add hour/day/weekday/month columns derived from the transaction timestamp.
+
+    The frame is modified in place (the timestamp column is also replaced by
+    its parsed datetime form) and the same object is returned.
+    """
+    timestamps = pd.to_datetime(df['trans_date_trans_time'])
+    df['trans_date_trans_time'] = timestamps
+    for part in ('hour', 'day', 'weekday', 'month'):
+        df[part] = getattr(timestamps.dt, part)
+    return df
+
+def calculate_age(dob, reference_date=None):
+    """Calculate age in whole calendar years from date of birth.
+
+    Args:
+        dob: A pandas Series of dates of birth (strings or datetimes); the
+            ``.dt`` accessor is used, so a bare scalar is not supported.
+        reference_date: Optional single date (string or datetime) to measure
+            the age against. Defaults to the current local time, which makes
+            the result depend on when the code runs; pass a fixed date for
+            reproducible features.
+
+    Returns:
+        A Series holding ``reference year - birth year`` for each row. Month
+        and day are ignored.
+    """
+    if reference_date is None:
+        reference = datetime.now()
+    else:
+        reference = pd.to_datetime(reference_date)
+    return reference.year - pd.to_datetime(dob).dt.year
+
+def preprocess_data(df):
+    """Return an engineered copy of ``df``; the caller's frame is left untouched.
+
+    Adds the time-derived columns, ``age`` and ``distance``, then removes the
+    raw columns listed below (columns that are absent are ignored).
+    """
+    # Work on a copy because extract_time_features writes into its argument
+    frame = extract_time_features(df.copy())
+
+    frame['age'] = calculate_age(frame['dob'])
+
+    # Distance between user and merchant coordinates
+    user_coords = (frame['lat'], frame['long'])
+    merchant_coords = (frame['merch_lat'], frame['merch_long'])
+    frame['distance'] = calculate_distance(*user_coords, *merchant_coords)
+
+    unused_columns = [
+        'trans_date_trans_time', 'first', 'last', 'street', 'city',
+        'zip', 'trans_num', 'unix_time', 'dob', 'cc_num',
+    ]
+    return frame.drop(columns=unused_columns, errors='ignore')
+
+def create_preprocessing_pipeline():
+    """Build an unfitted ColumnTransformer for the engineered feature frame.
+
+    Numeric columns (NUMERICAL_FEATURES plus the engineered ones) are
+    standard-scaled; CATEGORICAL_FEATURES are one-hot encoded with unknown
+    categories ignored at transform time.
+    """
+    engineered_columns = ['age', 'distance', 'hour', 'day', 'weekday', 'month']
+    numeric_columns = NUMERICAL_FEATURES + engineered_columns
+
+    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
+    encode_categorical = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
+
+    return ColumnTransformer(
+        transformers=[
+            ('num', scale_numeric, numeric_columns),
+            ('cat', encode_categorical, CATEGORICAL_FEATURES),
+        ])
+
+def save_preprocessor(preprocessor, filename='preprocessor.joblib'):
+    """Persist ``preprocessor`` under MODELS_DIR, creating the directory if needed."""
+    destination = MODELS_DIR / filename
+    MODELS_DIR.mkdir(parents=True, exist_ok=True)
+    joblib.dump(preprocessor, destination)
+
+def load_preprocessor(filename='preprocessor.joblib'):
+    """Read a previously saved preprocessor from MODELS_DIR and return it."""
+    source = MODELS_DIR / filename
+    return joblib.load(source)
+
+def prepare_data(df, preprocessor=None, fit=False):
+    """Turn a raw transaction frame into model-ready features.
+
+    Args:
+        df: Raw transactions; an ``is_fraud`` column is optional.
+        preprocessor: Transformer to use. When omitted, a new unfitted
+            pipeline is created.
+        fit: When true, fit the transformer on this data and save it to disk;
+            otherwise only transform.
+
+    Returns:
+        ``(X_transformed, y, preprocessor)`` where ``y`` is the ``is_fraud``
+        column, or None if the frame has no such column.
+    """
+    frame = preprocess_data(df)
+
+    # Split off the target when it is present
+    has_target = 'is_fraud' in frame.columns
+    y = frame['is_fraud'] if has_target else None
+    X = frame.drop(columns=['is_fraud'], errors='ignore')
+
+    if preprocessor is None:
+        preprocessor = create_preprocessing_pipeline()
+
+    if not fit:
+        return preprocessor.transform(X), y, preprocessor
+
+    X_transformed = preprocessor.fit_transform(X)
+    save_preprocessor(preprocessor)
+    return X_transformed, y, preprocessor
@@ -0,0 +1,103 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
+import xgboost as xgb
+import joblib
+from pathlib import Path
+
+from config import (
+    TRAIN_DATA_PATH,
+    TEST_DATA_PATH,
+    MODELS_DIR,
+    RANDOM_STATE,
+    TEST_SIZE
+)
+from data_preprocessing import prepare_data
+
+def load_data():
+    """Read the train/test CSVs and return ``(X_train, y_train, X_test, y_test)``.
+
+    The preprocessor is fitted on the training file and then reused, without
+    refitting, for the test file.
+    """
+    raw_train = pd.read_csv(TRAIN_DATA_PATH)
+    raw_test = pd.read_csv(TEST_DATA_PATH)
+
+    X_train, y_train, fitted_preprocessor = prepare_data(raw_train, fit=True)
+    X_test, y_test, _ = prepare_data(raw_test, preprocessor=fitted_preprocessor)
+
+    return X_train, y_train, X_test, y_test
+
+def train_model(X_train, y_train):
+    """Fit an XGBoost binary classifier on the training data and return it."""
+    classifier = xgb.XGBClassifier(
+        objective='binary:logistic',
+        eval_metric='auc',
+        max_depth=6,
+        learning_rate=0.1,
+        n_estimators=100,
+        subsample=0.8,
+        colsample_bytree=0.8,
+        random_state=RANDOM_STATE,
+    )
+    classifier.fit(X_train, y_train)
+    return classifier
+
+def evaluate_model(model, X_test, y_test):
+    """Evaluate the model performance.
+
+    Prints the classification report, confusion matrix and ROC AUC, and
+    returns the same metrics in a JSON-friendly form.
+
+    Args:
+        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
+        X_test: Transformed test features.
+        y_test: True labels for the test rows.
+
+    Returns:
+        A dict with keys ``classification_report`` (dict form),
+        ``confusion_matrix`` (nested lists) and ``roc_auc_score``.
+    """
+    # Make predictions
+    y_pred = model.predict(X_test)
+    y_pred_proba = model.predict_proba(X_test)[:, 1]
+
+    # The text and dict reports are different renderings, so both are built;
+    # the confusion matrix and AUC are computed once and reused below.
+    print("Classification Report:")
+    print(classification_report(y_test, y_pred))
+
+    print("\nConfusion Matrix:")
+    matrix = confusion_matrix(y_test, y_pred)
+    print(matrix)
+
+    auc = roc_auc_score(y_test, y_pred_proba)
+    print("\nROC AUC Score:", auc)
+
+    return {
+        'classification_report': classification_report(y_test, y_pred, output_dict=True),
+        'confusion_matrix': matrix.tolist(),
+        'roc_auc_score': auc
+    }
+
+def save_model(model, metrics, filename='fraud_model.joblib'):
+    """Write the model to MODELS_DIR and its metrics to ``model_metrics.json`` beside it."""
+    import json
+
+    MODELS_DIR.mkdir(parents=True, exist_ok=True)
+
+    model_path = MODELS_DIR / filename
+    joblib.dump(model, model_path)
+
+    metrics_path = MODELS_DIR / 'model_metrics.json'
+    with open(metrics_path, 'w') as handle:
+        json.dump(metrics, handle)
+
+def main():
+    """Run the full training workflow: load, train, evaluate, save."""
+    print("Loading data...")
+    train_features, train_labels, test_features, test_labels = load_data()
+
+    print("Training model...")
+    fitted_model = train_model(train_features, train_labels)
+
+    print("Evaluating model...")
+    scores = evaluate_model(fitted_model, test_features, test_labels)
+
+    print("Saving model and metrics...")
+    save_model(fitted_model, scores)
+
+    print("Training completed successfully!")
+
+# Run the training workflow only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,129 @@
+import streamlit as st
+import pandas as pd
+import requests
+import json
+from datetime import datetime
+import random
+
+# Prediction endpoint the form posts to
+API_URL = "http://localhost:8000/predict"
+
+# Example transaction; the form below reads several of its defaults from here
+SAMPLE_TRANSACTION = dict(
+    trans_date_trans_time="2020-06-21 12:14:25",
+    cc_num="1234567890123456",
+    merchant="fraud_Rippin, Kub and Mann",
+    category="misc_net",
+    amt=4.97,
+    first="Jennifer",
+    last="Banks",
+    gender="F",
+    street="561 Perry Cove",
+    city="Moravian Falls",
+    state="NC",
+    zip="28654",
+    lat=36.0788,
+    long=-81.1781,
+    city_pop=3495,
+    job="Psychologist, counselling",
+    dob="1988-03-09",
+    trans_num="0b242abb623afc578575680df30655b9",
+    unix_time=1371816885,
+    merch_lat=36.011293,
+    merch_long=-82.048315,
+)
+
+def main():
+    """Render the fraud-check form and show the API's verdict for a submission.
+
+    The form is pre-filled from SAMPLE_TRANSACTION. On submit, the inputs are
+    assembled into a JSON payload and posted to API_URL. The card number and
+    transaction id are not collected from the user; random placeholders are
+    generated for them. A successful response is rendered as a verdict with
+    probability and confidence; an error response or a connection problem is
+    reported on the page instead.
+    """
+    st.title("Fraud Detection System")
+    st.write("Enter transaction details to check for potential fraud.")
+
+    # Create form for transaction details
+    with st.form("transaction_form"):
+        col1, col2 = st.columns(2)
+
+        with col1:
+            st.subheader("Transaction Details")
+            trans_date = st.date_input("Transaction Date", datetime.now())
+            trans_time = st.time_input("Transaction Time", datetime.now().time())
+            merchant = st.text_input("Merchant", SAMPLE_TRANSACTION["merchant"])
+            category = st.text_input("Category", SAMPLE_TRANSACTION["category"])
+            amount = st.number_input("Amount", value=SAMPLE_TRANSACTION["amt"], min_value=0.0)
+
+        with col2:
+            st.subheader("Cardholder Details")
+            first_name = st.text_input("First Name", SAMPLE_TRANSACTION["first"])
+            last_name = st.text_input("Last Name", SAMPLE_TRANSACTION["last"])
+            gender = st.selectbox("Gender", ["M", "F"], index=1)
+            dob = st.date_input("Date of Birth", datetime.strptime(SAMPLE_TRANSACTION["dob"], "%Y-%m-%d"))
+            job = st.text_input("Job", SAMPLE_TRANSACTION["job"])
+
+        st.subheader("Location Details")
+        col3, col4 = st.columns(2)
+
+        with col3:
+            street = st.text_input("Street", SAMPLE_TRANSACTION["street"])
+            city = st.text_input("City", SAMPLE_TRANSACTION["city"])
+            state = st.text_input("State", SAMPLE_TRANSACTION["state"])
+            zip_code = st.text_input("ZIP Code", SAMPLE_TRANSACTION["zip"])
+            lat = st.number_input("Latitude", value=SAMPLE_TRANSACTION["lat"])
+            long = st.number_input("Longitude", value=SAMPLE_TRANSACTION["long"])
+            city_pop = st.number_input("City Population", value=SAMPLE_TRANSACTION["city_pop"])
+
+        with col4:
+            merch_lat = st.number_input("Merchant Latitude", value=SAMPLE_TRANSACTION["merch_lat"])
+            merch_long = st.number_input("Merchant Longitude", value=SAMPLE_TRANSACTION["merch_long"])
+
+        submitted = st.form_submit_button("Check for Fraud")
+
+    if submitted:
+        # Prepare transaction data in the shape the API's request model expects
+        transaction = {
+            "trans_date_trans_time": f"{trans_date} {trans_time}",
+            "cc_num": str(random.randint(1000000000000000, 9999999999999999)),
+            "merchant": merchant,
+            "category": category,
+            "amt": float(amount),
+            "first": first_name,
+            "last": last_name,
+            "gender": gender,
+            "street": street,
+            "city": city,
+            "state": state,
+            "zip": zip_code,
+            "lat": float(lat),
+            "long": float(long),
+            "city_pop": int(city_pop),
+            "job": job,
+            "dob": dob.strftime("%Y-%m-%d"),
+            "trans_num": f"{random.getrandbits(128):032x}",
+            "unix_time": int(datetime.combine(trans_date, trans_time).timestamp()),
+            "merch_lat": float(merch_lat),
+            "merch_long": float(merch_long)
+        }
+
+        try:
+            # A timeout keeps the page from waiting forever on an unresponsive server
+            response = requests.post(API_URL, json=transaction, timeout=10)
+
+            # Error responses carry no prediction fields; report them rather
+            # than indexing into the body and failing with a KeyError.
+            if not response.ok:
+                try:
+                    payload = response.json()
+                except ValueError:
+                    payload = None
+                detail = payload.get("detail") if isinstance(payload, dict) else None
+                st.error(f"API returned an error (HTTP {response.status_code}): {detail or response.text}")
+                return
+
+            result = response.json()
+
+            # Display results
+            st.subheader("Fraud Detection Results")
+
+            if result["is_fraud"]:
+                st.error("⚠️ Fraudulent Transaction Detected!")
+            else:
+                st.success("✅ Legitimate Transaction")
+
+            st.write(f"Fraud Probability: {result['fraud_probability']:.2%}")
+            st.write(f"Confidence Level: {result['confidence']}")
+
+            # Display additional information
+            with st.expander("Transaction Details"):
+                st.json(transaction)
+
+        except requests.exceptions.RequestException as e:
+            st.error(f"Error connecting to the API: {str(e)}")
+            st.info("Please make sure the API server is running.")
+
+# Call main() when this module is run as the entry script.
+if __name__ == "__main__":
+    main()