First commit

Defined file structure and completed EDA
This commit is contained in:
boladeE
2025-04-24 23:39:36 +01:00
commit 50e95445fb
21 changed files with 1514 additions and 0 deletions
View File
View File
+96
View File
@@ -0,0 +1,96 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from typing import Optional
from config import MODELS_DIR
from data_preprocessing import prepare_data
# Application instance that the route decorators below register against.
app = FastAPI(
    title="Fraud Detection API",
    description="API for detecting fraudulent transactions",
    version="1.0.0",
)
class Transaction(BaseModel):
    """Request body for the ``/predict`` endpoint: one raw transaction record.

    The ``predict`` handler turns an instance into a one-row DataFrame, so the
    field names double as the column names handed to ``prepare_data``.
    """

    # --- Transaction ---
    trans_date_trans_time: str  # timestamp as text, e.g. "2020-06-21 12:14:25"
    cc_num: str
    merchant: str
    category: str
    amt: float

    # --- Cardholder identity and address ---
    first: str
    last: str
    gender: str
    street: str
    city: str
    state: str
    zip: str
    lat: float
    long: float
    city_pop: int
    job: str
    dob: str  # date of birth as text, e.g. "1988-03-09"

    # --- Transaction identifiers ---
    trans_num: str
    unix_time: int

    # --- Merchant location ---
    merch_lat: float
    merch_long: float
class PredictionResponse(BaseModel):
    """Response body returned by the ``/predict`` endpoint."""

    # True when the fraud probability reaches the handler's 0.5 cut-off.
    is_fraud: bool
    # Probability the model assigns to the positive (fraud) class.
    fraud_probability: float
    # Label produced by ``get_confidence_level`` from the probability.
    confidence: str
# Most recently loaded artifacts, keyed by their file paths. Each entry is
# ``(mtime_stamp, (model, preprocessor))`` so a retrained model on disk is
# picked up automatically while unchanged files are not re-read per request.
_ARTIFACT_CACHE = {}


def load_model():
    """Load the trained model and preprocessor.

    The artifacts are deserialized once and reused until either file's
    modification time changes, instead of being re-read on every request.

    Returns:
        tuple: ``(model, preprocessor)`` as stored under ``MODELS_DIR``.

    Raises:
        HTTPException: status 500 when either artifact file is missing.
    """
    model_path = MODELS_DIR / "fraud_model.joblib"
    preprocessor_path = MODELS_DIR / "preprocessor.joblib"
    cache_key = (model_path, preprocessor_path)

    try:
        # stat() raises FileNotFoundError too, so a missing file is reported
        # the same way whether or not something is already cached.
        stamp = tuple(path.stat().st_mtime_ns for path in cache_key)
        entry = _ARTIFACT_CACHE.get(cache_key)
        if entry is None or entry[0] != stamp:
            # NOTE: joblib.load unpickles its input; only point MODELS_DIR at
            # artifacts produced by this project's own training run.
            artifacts = (joblib.load(model_path), joblib.load(preprocessor_path))
            entry = (stamp, artifacts)
            _ARTIFACT_CACHE[cache_key] = entry
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=500,
            detail="Model not found. Please train the model first.",
        ) from exc

    return entry[1]
def get_confidence_level(probability: float) -> str:
    """Map a fraud probability onto a coarse confidence label.

    Args:
        probability: Fraud probability, compared against fixed cut-offs.

    Returns:
        "Very High" (>= 0.9), "High" (>= 0.7), "Medium" (>= 0.5) or "Low".
    """
    # NOTE(review): the label follows the fraud probability itself, so a
    # near-zero probability reads as "Low" -- confirm that is the intent
    # rather than "confidence in the predicted class".
    # Ordered from the highest cut-off down; the first one reached wins.
    bands = ((0.9, "Very High"), (0.7, "High"), (0.5, "Medium"))
    for floor, label in bands:
        if probability >= floor:
            return label
    return "Low"
@app.get("/")
async def root():
    """Return a static welcome message for the API root path."""
    greeting = {"message": "Welcome to the Fraud Detection API"}
    return greeting
@app.post("/predict", response_model=PredictionResponse)
async def predict(transaction: Transaction):
    """Predict whether a transaction is fraudulent.

    Args:
        transaction: Validated request body describing one transaction.

    Returns:
        PredictionResponse: fraud flag (probability >= 0.5), the fraud
        probability, and its confidence label.

    Raises:
        HTTPException: the error raised by ``load_model`` is passed through
            unchanged; any other failure is reported as status 500 with the
            exception text as detail.
    """
    try:
        model, preprocessor = load_model()

        # One-row frame whose columns are the request's field names.
        frame = pd.DataFrame([transaction.dict()])

        # Only the transformed features are needed; no target at inference.
        features, _, _ = prepare_data(frame, preprocessor=preprocessor)

        # Column 1 of predict_proba is the positive (fraud) class.
        probability = float(model.predict_proba(features)[0, 1])

        return PredictionResponse(
            is_fraud=probability >= 0.5,
            fraud_probability=probability,
            confidence=get_confidence_level(probability),
        )
    except HTTPException:
        # Already a well-formed HTTP error (e.g. "Model not found" from
        # load_model); re-wrapping it would mangle its detail and status.
        raise
    except Exception as exc:
        # NOTE(review): the raw exception text is returned to the client;
        # consider logging it and sending a generic message instead.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
if __name__ == "__main__":
    # Imported here because they are only needed when run as a script.
    import uvicorn

    from config import API_HOST, API_PORT

    # Bind address and port come from config so they are defined in one place.
    uvicorn.run(app, host=API_HOST, port=API_PORT)
View File
+26
View File
@@ -0,0 +1,26 @@
import os
from pathlib import Path
# ---------------------------------------------------------------------------
# Project paths (all derived from the directory two levels above this file)
# ---------------------------------------------------------------------------
ROOT_DIR = Path(__file__).parent.parent
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")
MODELS_DIR = ROOT_DIR.joinpath("models")

# ---------------------------------------------------------------------------
# Data files
# ---------------------------------------------------------------------------
TRAIN_DATA_PATH = RAW_DATA_DIR.joinpath("fraudTrain.csv")
TEST_DATA_PATH = RAW_DATA_DIR.joinpath("fraudTest.csv")

# ---------------------------------------------------------------------------
# Model parameters
# ---------------------------------------------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2

# ---------------------------------------------------------------------------
# Feature engineering parameters (column names in the raw data)
# ---------------------------------------------------------------------------
CATEGORICAL_FEATURES = [
    "merchant",
    "category",
    "gender",
    "job",
    "state",
]
NUMERICAL_FEATURES = [
    "amt",
    "lat",
    "long",
    "city_pop",
    "merch_lat",
    "merch_long",
]
TIME_FEATURES = ["trans_date_trans_time"]

# ---------------------------------------------------------------------------
# API settings
# ---------------------------------------------------------------------------
API_HOST = "0.0.0.0"
API_PORT = 8000
+112
View File
@@ -0,0 +1,112 @@
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from pathlib import Path
from config import (
CATEGORICAL_FEATURES,
NUMERICAL_FEATURES,
TIME_FEATURES,
PROCESSED_DATA_DIR,
MODELS_DIR
)
def calculate_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Calculate the Haversine (great-circle) distance between two points.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted.

    Args:
        lat1, lon1: Coordinates of the first point, in degrees.
        lat2, lon2: Coordinates of the second point, in degrees.
        radius_km: Sphere radius; the default is Earth's mean radius in
            kilometers, which makes the result kilometers.

    Returns:
        Distance along the sphere's surface, in the units of ``radius_km``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # For (near-)antipodal points rounding can push ``a`` a hair above 1,
    # which would make arcsin return NaN; clamp it to the valid domain.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c
def extract_time_features(df):
    """Add hour/day/weekday/month columns derived from the transaction time.

    The frame is modified in place: ``trans_date_trans_time`` is replaced by
    its parsed datetime form and the four new columns are appended.

    Args:
        df: DataFrame with a ``trans_date_trans_time`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    timestamps = pd.to_datetime(df['trans_date_trans_time'])
    df['trans_date_trans_time'] = timestamps
    # Each name is both the ``.dt`` attribute read and the column written.
    for part in ('hour', 'day', 'weekday', 'month'):
        df[part] = getattr(timestamps.dt, part)
    return df
def calculate_age(dob, reference_date=None):
    """Calculate age in completed years from date of birth.

    Args:
        dob: pandas Series of birth dates (strings or datetimes).
        reference_date: Moment at which the age is measured. May be a single
            timestamp or a Series aligned with ``dob``. Defaults to the
            current time.

    Returns:
        pandas Series of integer ages; a year is only counted once the
        birthday has been reached on the reference date.
    """
    born = pd.to_datetime(dob)
    reference = pd.to_datetime(
        datetime.now() if reference_date is None else reference_date
    )
    # A Series exposes its datetime parts through ``.dt``; a single Timestamp
    # exposes them directly.
    ref = reference.dt if isinstance(reference, pd.Series) else reference

    birthday_pending = (ref.month < born.dt.month) | (
        (ref.month == born.dt.month) & (ref.day < born.dt.day)
    )
    return ref.year - born.dt.year - birthday_pending.astype(int)


def preprocess_data(df):
    """Preprocess the input dataframe into model-ready raw features.

    Adds time features, ``age`` and ``distance``, then removes identifier and
    free-text columns. The caller's frame is left untouched.

    Args:
        df: Raw transactions with at least ``trans_date_trans_time``, ``dob``,
            ``lat``, ``long``, ``merch_lat`` and ``merch_long`` columns.

    Returns:
        A new DataFrame with the engineered columns added and the dropped
        columns removed.
    """
    # Work on a copy because extract_time_features mutates its argument.
    df = df.copy()

    # Extract time features (also parses trans_date_trans_time to datetime)
    df = extract_time_features(df)

    # Age at the moment of the transaction, not at the moment this code runs,
    # so the feature is identical at training and at serving time.
    df['age'] = calculate_age(df['dob'], df['trans_date_trans_time'])

    # Calculate distance between user and merchant
    df['distance'] = calculate_distance(
        df['lat'], df['long'],
        df['merch_lat'], df['merch_long']
    )

    # Drop columns that are identifiers, free text, or already encoded by the
    # engineered features; errors='ignore' tolerates inputs lacking some.
    columns_to_drop = ['trans_date_trans_time', 'first', 'last', 'street', 'city',
                       'zip', 'trans_num', 'unix_time', 'dob', 'cc_num']
    return df.drop(columns=columns_to_drop, errors='ignore')
def create_preprocessing_pipeline():
    """Build the (unfitted) column transformer used before the model.

    Numeric columns -- the configured numerical features plus the engineered
    age, distance and time parts -- are standardized; the configured
    categorical columns are one-hot encoded, with categories not seen during
    fitting ignored rather than raising.

    Returns:
        ColumnTransformer: unfitted transformer with ``num`` and ``cat`` parts.
    """
    engineered_columns = ['age', 'distance', 'hour', 'day', 'weekday', 'month']
    scaled_columns = NUMERICAL_FEATURES + engineered_columns

    scale_step = Pipeline(steps=[('scaler', StandardScaler())])
    encode_step = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    return ColumnTransformer(
        transformers=[
            ('num', scale_step, scaled_columns),
            ('cat', encode_step, CATEGORICAL_FEATURES),
        ]
    )
def save_preprocessor(preprocessor, filename='preprocessor.joblib'):
    """Persist the preprocessor under ``MODELS_DIR``.

    Args:
        preprocessor: Object to serialize with joblib.
        filename: File name inside ``MODELS_DIR``.
    """
    # Create the models directory (and any missing parents) on first use.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    target = MODELS_DIR / filename
    joblib.dump(preprocessor, target)
def load_preprocessor(filename='preprocessor.joblib'):
    """Read a previously saved preprocessor back from ``MODELS_DIR``.

    Args:
        filename: File name inside ``MODELS_DIR``.

    Returns:
        The object joblib deserializes from that file.
    """
    source = MODELS_DIR / filename
    return joblib.load(source)
def prepare_data(df, preprocessor=None, fit=False):
    """Prepare data for model training or prediction.

    Args:
        df: Raw transactions; an ``is_fraud`` column, when present, is split
            off as the target.
        preprocessor: Transformer to use. When omitted, a fresh pipeline is
            built if ``fit`` is true, otherwise the one previously saved to
            disk is loaded.
        fit: When true, fit the preprocessor on ``df`` and save it to disk;
            when false, only transform.

    Returns:
        tuple: ``(X_transformed, y, preprocessor)`` where ``y`` is ``None``
        if ``df`` has no ``is_fraud`` column.
    """
    # Preprocess the data
    df_processed = preprocess_data(df)

    # Separate features and target
    has_target = 'is_fraud' in df_processed.columns
    y = df_processed['is_fraud'] if has_target else None
    X = df_processed.drop(columns=['is_fraud'], errors='ignore')

    if preprocessor is None:
        # A freshly built pipeline is unfitted and cannot transform, so for
        # transform-only calls fall back to the persisted, fitted one.
        preprocessor = create_preprocessing_pipeline() if fit else load_preprocessor()

    # Transform features
    if fit:
        X_transformed = preprocessor.fit_transform(X)
        # Persist so that later transform-only runs use the same fit.
        save_preprocessor(preprocessor)
    else:
        X_transformed = preprocessor.transform(X)

    return X_transformed, y, preprocessor
View File
+103
View File
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
import joblib
from pathlib import Path
from config import (
TRAIN_DATA_PATH,
TEST_DATA_PATH,
MODELS_DIR,
RANDOM_STATE,
TEST_SIZE
)
from data_preprocessing import prepare_data
def load_data():
    """Read the train/test CSVs and turn them into model-ready matrices.

    The preprocessor is fitted on the training file only (which also saves it
    to disk via ``prepare_data``) and then reused, unchanged, for the test
    file.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``.
    """
    raw_train = pd.read_csv(TRAIN_DATA_PATH)
    raw_test = pd.read_csv(TEST_DATA_PATH)

    # Fit on training data ...
    X_train, y_train, fitted_preprocessor = prepare_data(raw_train, fit=True)
    # ... and apply the same fitted transformer to the test data.
    X_test, y_test, _ = prepare_data(raw_test, preprocessor=fitted_preprocessor)

    return X_train, y_train, X_test, y_test
def train_model(X_train, y_train):
    """Fit an XGBoost binary classifier on the training data.

    Args:
        X_train: Transformed training features.
        y_train: Training labels.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    classifier = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,  # fixed seed from config
    )
    classifier.fit(X_train, y_train)
    return classifier
def evaluate_model(model, X_test, y_test):
    """Print and return test-set metrics for a fitted classifier.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Transformed test features.
        y_test: True test labels.

    Returns:
        dict: ``classification_report`` (as a dict), ``confusion_matrix`` (as
        nested lists) and ``roc_auc_score``.
    """
    y_pred = model.predict(X_test)
    # Probability of the positive class, needed for ROC AUC.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(matrix)

    auc = roc_auc_score(y_test, y_pred_proba)
    print("\nROC AUC Score:", auc)

    # The report is requested a second time in dict form for serialization.
    report = classification_report(y_test, y_pred, output_dict=True)
    return {
        'classification_report': report,
        'confusion_matrix': matrix.tolist(),
        'roc_auc_score': auc,
    }
def save_model(model, metrics, filename='fraud_model.joblib'):
    """Write the trained model and its metrics under ``MODELS_DIR``.

    Args:
        model: Fitted model, serialized with joblib.
        metrics: JSON-serializable metrics, written to ``model_metrics.json``.
        filename: File name for the model inside ``MODELS_DIR``.
    """
    import json

    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    model_path = MODELS_DIR / filename
    joblib.dump(model, model_path)

    metrics_path = MODELS_DIR / 'model_metrics.json'
    with open(metrics_path, 'w') as handle:
        json.dump(metrics, handle)
def main():
    """Run the full pipeline: load data, train, evaluate, then save."""
    print("Loading data...")
    train_features, train_labels, test_features, test_labels = load_data()

    print("Training model...")
    fitted_model = train_model(train_features, train_labels)

    print("Evaluating model...")
    scores = evaluate_model(fitted_model, test_features, test_labels)

    print("Saving model and metrics...")
    save_model(fitted_model, scores)

    print("Training completed successfully!")


# Train only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
View File
+129
View File
@@ -0,0 +1,129 @@
import streamlit as st
import pandas as pd
import requests
import json
from datetime import datetime
import random
# Prediction endpoint of the locally running API server.
API_URL = "http://localhost:8000/predict"

# Example transaction; its values pre-fill the form widgets in ``main``.
SAMPLE_TRANSACTION = {
    # Transaction
    "trans_date_trans_time": "2020-06-21 12:14:25",
    "cc_num": "1234567890123456",
    "merchant": "fraud_Rippin, Kub and Mann",
    "category": "misc_net",
    "amt": 4.97,
    # Cardholder
    "first": "Jennifer",
    "last": "Banks",
    "gender": "F",
    "street": "561 Perry Cove",
    "city": "Moravian Falls",
    "state": "NC",
    "zip": "28654",
    "lat": 36.0788,
    "long": -81.1781,
    "city_pop": 3495,
    "job": "Psychologist, counselling",
    "dob": "1988-03-09",
    # Identifiers
    "trans_num": "0b242abb623afc578575680df30655b9",
    "unix_time": 1371816885,
    # Merchant location
    "merch_lat": 36.011293,
    "merch_long": -82.048315,
}
def main():
    """Render the fraud-check form and show the API's verdict on submit.

    The form is pre-filled from ``SAMPLE_TRANSACTION``. On submission the
    values are posted to ``API_URL`` and the returned fraud flag, probability
    and confidence label are displayed. Network failures and non-success HTTP
    responses are reported in the page instead of raising.
    """
    st.title("Fraud Detection System")
    st.write("Enter transaction details to check for potential fraud.")

    # Create form for transaction details
    with st.form("transaction_form"):
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Transaction Details")
            trans_date = st.date_input("Transaction Date", datetime.now())
            trans_time = st.time_input("Transaction Time", datetime.now().time())
            merchant = st.text_input("Merchant", SAMPLE_TRANSACTION["merchant"])
            category = st.text_input("Category", SAMPLE_TRANSACTION["category"])
            amount = st.number_input("Amount", value=SAMPLE_TRANSACTION["amt"], min_value=0.0)

        with col2:
            st.subheader("Cardholder Details")
            first_name = st.text_input("First Name", SAMPLE_TRANSACTION["first"])
            last_name = st.text_input("Last Name", SAMPLE_TRANSACTION["last"])
            gender = st.selectbox("Gender", ["M", "F"], index=1)
            # Without explicit bounds the picker only offers dates within ten
            # years of the default, which rules out most birth dates.
            dob = st.date_input(
                "Date of Birth",
                datetime.strptime(SAMPLE_TRANSACTION["dob"], "%Y-%m-%d"),
                min_value=datetime(1900, 1, 1).date(),
                max_value=datetime.now().date(),
            )
            job = st.text_input("Job", SAMPLE_TRANSACTION["job"])

        st.subheader("Location Details")
        col3, col4 = st.columns(2)

        with col3:
            street = st.text_input("Street", SAMPLE_TRANSACTION["street"])
            city = st.text_input("City", SAMPLE_TRANSACTION["city"])
            state = st.text_input("State", SAMPLE_TRANSACTION["state"])
            zip_code = st.text_input("ZIP Code", SAMPLE_TRANSACTION["zip"])
            lat = st.number_input("Latitude", value=SAMPLE_TRANSACTION["lat"])
            long = st.number_input("Longitude", value=SAMPLE_TRANSACTION["long"])
            city_pop = st.number_input("City Population", value=SAMPLE_TRANSACTION["city_pop"])

        with col4:
            merch_lat = st.number_input("Merchant Latitude", value=SAMPLE_TRANSACTION["merch_lat"])
            merch_long = st.number_input("Merchant Longitude", value=SAMPLE_TRANSACTION["merch_long"])

        submitted = st.form_submit_button("Check for Fraud")

    if not submitted:
        return

    # Prepare transaction data. The form has no card or transaction number
    # fields, so random placeholders are generated for those two keys.
    transaction = {
        "trans_date_trans_time": f"{trans_date} {trans_time}",
        "cc_num": str(random.randint(1000000000000000, 9999999999999999)),
        "merchant": merchant,
        "category": category,
        "amt": float(amount),
        "first": first_name,
        "last": last_name,
        "gender": gender,
        "street": street,
        "city": city,
        "state": state,
        "zip": zip_code,
        "lat": float(lat),
        "long": float(long),
        "city_pop": int(city_pop),
        "job": job,
        "dob": dob.strftime("%Y-%m-%d"),
        "trans_num": f"{random.getrandbits(128):032x}",
        "unix_time": int(datetime.combine(trans_date, trans_time).timestamp()),
        "merch_lat": float(merch_lat),
        "merch_long": float(merch_long),
    }

    # Upper bound on how long the page waits for the API, in seconds.
    request_timeout_s = 30

    try:
        # Send request to API; without a timeout a stalled server would hang
        # the page indefinitely.
        response = requests.post(API_URL, json=transaction, timeout=request_timeout_s)
        # Turn 4xx/5xx replies into exceptions so an error payload is never
        # read as if it were a prediction.
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.HTTPError:
        # The server answered, but with an error status.
        st.error(f"The API returned an error (HTTP {response.status_code}): {response.text}")
    except requests.exceptions.RequestException as e:
        st.error(f"Error connecting to the API: {str(e)}")
        st.info("Please make sure the API server is running.")
    else:
        # Display results
        st.subheader("Fraud Detection Results")
        if result["is_fraud"]:
            st.error("⚠️ Fraudulent Transaction Detected!")
        else:
            st.success("✅ Legitimate Transaction")

        st.write(f"Fraud Probability: {result['fraud_probability']:.2%}")
        st.write(f"Confidence Level: {result['confidence']}")

        # Display additional information
        with st.expander("Transaction Details"):
            st.json(transaction)


if __name__ == "__main__":
    main()