First commit
Defined file structure and completed EDA
This commit is contained in:
@@ -0,0 +1,96 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import joblib
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from config import MODELS_DIR
|
||||
from data_preprocessing import prepare_data
|
||||
|
||||
# Application object that the route decorators in this module attach to.
# The title, description and version are passed to FastAPI as metadata.
app = FastAPI(
    title="Fraud Detection API",
    description="API for detecting fraudulent transactions",
    version="1.0.0",
)
|
||||
|
||||
class Transaction(BaseModel):
    # Request body for /predict: one raw transaction record. The handler
    # turns the whole record into a one-row DataFrame and passes it to
    # prepare_data, so field names must stay exactly as declared here.

    # --- transaction ---
    trans_date_trans_time: str  # parsed later with pd.to_datetime
    cc_num: str
    merchant: str
    category: str
    amt: float

    # --- cardholder ---
    first: str
    last: str
    gender: str

    # --- cardholder address / location ---
    street: str
    city: str
    state: str
    zip: str
    lat: float
    long: float
    city_pop: int

    # --- cardholder profile ---
    job: str
    dob: str  # parsed later with pd.to_datetime to derive an age feature

    # --- transaction identifiers ---
    trans_num: str
    unix_time: int

    # --- merchant location ---
    merch_lat: float
    merch_long: float
|
||||
|
||||
class PredictionResponse(BaseModel):
    # Response body returned by /predict.

    # True when fraud_probability is at or above the 0.5 cut-off used by
    # the handler.
    is_fraud: bool

    # Positive-class probability taken from model.predict_proba.
    fraud_probability: float

    # Label produced by get_confidence_level from fraud_probability.
    confidence: str
|
||||
|
||||
def load_model():
    """Load the trained model and preprocessor.

    Both artifacts are read from ``MODELS_DIR`` with joblib on every call.

    Returns:
        A ``(model, preprocessor)`` tuple.

    Raises:
        HTTPException: status 500 when either artifact file is missing.
            The original ``FileNotFoundError`` is chained as the cause so
            the missing path still shows up in server-side tracebacks.
    """
    try:
        model = joblib.load(MODELS_DIR / "fraud_model.joblib")
        preprocessor = joblib.load(MODELS_DIR / "preprocessor.joblib")
    except FileNotFoundError as exc:
        raise HTTPException(
            status_code=500,
            detail="Model not found. Please train the model first.",
        ) from exc
    return model, preprocessor
|
||||
|
||||
def get_confidence_level(probability: float) -> str:
    """Map a probability onto one of four coarse labels.

    The label tracks the magnitude of ``probability`` itself: values of
    at least 0.9, 0.7 and 0.5 give "Very High", "High" and "Medium";
    anything else (including NaN, which fails every comparison) gives
    "Low".
    """
    # Ordered from the highest floor down; the first floor reached wins.
    tiers = (
        (0.9, "Very High"),
        (0.7, "High"),
        (0.5, "Medium"),
    )
    for floor, label in tiers:
        if probability >= floor:
            return label
    return "Low"
|
||||
|
||||
@app.get("/")
async def root():
    # Landing endpoint: returns a fixed greeting payload.
    greeting = {"message": "Welcome to the Fraud Detection API"}
    return greeting
|
||||
|
||||
@app.post("/predict", response_model=PredictionResponse)
async def predict(transaction: Transaction):
    """Predict whether a transaction is fraudulent."""
    # Flow: load artifacts -> one-row DataFrame -> prepare_data with the
    # fitted preprocessor -> positive-class probability -> response.
    # Raises HTTPException(500) for any failure along the way.
    try:
        # Load model and preprocessor (read from disk on every request).
        model, preprocessor = load_model()

        # Convert transaction to a single-row DataFrame.
        df = pd.DataFrame([transaction.dict()])

        # Prepare data for prediction; only the transformed features are used.
        X, _, _ = prepare_data(df, preprocessor=preprocessor)

        # Column 1 of predict_proba is read as the fraud probability;
        # float() turns the array scalar into a plain Python float.
        probability = float(model.predict_proba(X)[0, 1])

        return PredictionResponse(
            is_fraud=probability >= 0.5,
            fraud_probability=probability,
            confidence=get_confidence_level(probability),
        )
    except HTTPException:
        # load_model already raises a well-formed HTTPException; let it
        # through untouched instead of re-wrapping it below, which would
        # replace its detail with str() of the exception object.
        raise
    except Exception as e:
        # Any other failure becomes a 500 whose detail is the error text;
        # chain the cause so the server log keeps the original traceback.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
||||
|
||||
if __name__ == "__main__":
    # Run the API directly with uvicorn when this file is executed as a
    # script. Host and port come from config (API_HOST = "0.0.0.0",
    # API_PORT = 8000) instead of being repeated here as literals.
    import uvicorn

    from config import API_HOST, API_PORT

    uvicorn.run(app, host=API_HOST, port=API_PORT)
|
||||
@@ -0,0 +1,26 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# --- Project paths -------------------------------------------------------
# ROOT_DIR is two levels above this file; every other path hangs off it.
ROOT_DIR = Path(__file__).parent.parent
DATA_DIR = ROOT_DIR / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
MODELS_DIR = ROOT_DIR / "models"

# --- Data files ----------------------------------------------------------
TRAIN_DATA_PATH = RAW_DATA_DIR / "fraudTrain.csv"
TEST_DATA_PATH = RAW_DATA_DIR / "fraudTest.csv"

# --- Model parameters ----------------------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Feature engineering parameters --------------------------------------
CATEGORICAL_FEATURES = ["merchant", "category", "gender", "job", "state"]
NUMERICAL_FEATURES = [
    "amt",
    "lat",
    "long",
    "city_pop",
    "merch_lat",
    "merch_long",
]
TIME_FEATURES = ["trans_date_trans_time"]

# --- API settings --------------------------------------------------------
API_HOST = "0.0.0.0"
API_PORT = 8000
|
||||
@@ -0,0 +1,112 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
import joblib
|
||||
from pathlib import Path
|
||||
|
||||
from config import (
|
||||
CATEGORICAL_FEATURES,
|
||||
NUMERICAL_FEATURES,
|
||||
TIME_FEATURES,
|
||||
PROCESSED_DATA_DIR,
|
||||
MODELS_DIR
|
||||
)
|
||||
|
||||
def calculate_distance(lat1, lon1, lat2, lon2):
    """Calculate the Haversine distance between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
            All four may be scalars or array-likes that NumPy can
            broadcast together (the caller in this module passes
            DataFrame columns).

    Returns:
        The great-circle distance in kilometers, with the same shape as
        the broadcast inputs.
    """
    R = 6371  # Earth's radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # For (near-)antipodal points rounding can push `a` a hair above 1,
    # which would make arcsin return NaN; cap it at the mathematical maximum.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c
|
||||
|
||||
def extract_time_features(df):
    """Add hour/day/weekday/month columns derived from the timestamp.

    The 'trans_date_trans_time' column is converted to datetimes in
    place, then four calendar columns are added to the same DataFrame.
    The input frame is modified and also returned.
    """
    stamps = pd.to_datetime(df['trans_date_trans_time'])
    df['trans_date_trans_time'] = stamps
    # Each new column is named after the .dt attribute it is read from.
    for part in ('hour', 'day', 'weekday', 'month'):
        df[part] = getattr(stamps.dt, part)
    return df
|
||||
|
||||
def calculate_age(dob, reference_date=None):
    """Calculate age in whole years from date of birth.

    Args:
        dob: A pandas Series of birth dates (strings or datetimes); it is
            parsed with ``pd.to_datetime``.
        reference_date: The date at which to measure age. Anything
            ``pd.Timestamp`` accepts; defaults to the current local time.
            Pass a fixed value to get reproducible ages.

    Returns:
        A Series of ages aligned with ``dob``. One year is subtracted for
        rows whose birthday has not yet occurred in the reference year, so
        the result is a completed-years age rather than a plain difference
        of calendar years. Unparseable/missing dates yield NaN.
    """
    ref = pd.Timestamp(datetime.now() if reference_date is None else reference_date)
    born = pd.to_datetime(dob)
    birth_month = born.dt.month
    birth_day = born.dt.day
    # True where the birthday still lies ahead of `ref` within its year.
    before_birthday = (ref.month < birth_month) | (
        (ref.month == birth_month) & (ref.day < birth_day)
    )
    return ref.year - born.dt.year - before_birthday.astype(int)
|
||||
|
||||
def preprocess_data(df):
    """Derive model features from a raw transaction frame.

    Works on a copy, so the caller's DataFrame is left untouched. Adds
    the calendar columns from extract_time_features, an 'age' column and
    a 'distance' column (cardholder to merchant), then removes the raw
    columns that are not used as features. Columns in the removal list
    that are absent are ignored.
    """
    frame = extract_time_features(df.copy())

    frame['age'] = calculate_age(frame['dob'])
    frame['distance'] = calculate_distance(
        frame['lat'],
        frame['long'],
        frame['merch_lat'],
        frame['merch_long'],
    )

    # Raw timestamp, names, address parts and identifiers are dropped.
    unused = [
        'trans_date_trans_time', 'first', 'last', 'street', 'city',
        'zip', 'trans_num', 'unix_time', 'dob', 'cc_num',
    ]
    return frame.drop(columns=unused, errors='ignore')
|
||||
|
||||
def create_preprocessing_pipeline():
    """Build the (unfitted) column transformer used for model input.

    Numeric columns (NUMERICAL_FEATURES plus the engineered age,
    distance and calendar columns) go through a StandardScaler;
    CATEGORICAL_FEATURES go through a OneHotEncoder configured to ignore
    categories it did not see while fitting.
    """
    engineered_numeric = ['age', 'distance', 'hour', 'day', 'weekday', 'month']

    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
    encode_categorical = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
    )

    return ColumnTransformer(
        transformers=[
            ('num', scale_numeric, NUMERICAL_FEATURES + engineered_numeric),
            ('cat', encode_categorical, CATEGORICAL_FEATURES),
        ]
    )
|
||||
|
||||
def save_preprocessor(preprocessor, filename='preprocessor.joblib'):
    """Write the preprocessor to MODELS_DIR / filename with joblib.

    MODELS_DIR is created first (including parents) if it does not exist.
    """
    target = MODELS_DIR / filename
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(preprocessor, target)
|
||||
|
||||
def load_preprocessor(filename='preprocessor.joblib'):
    """Read a preprocessor back from MODELS_DIR / filename with joblib."""
    source = MODELS_DIR / filename
    return joblib.load(source)
|
||||
|
||||
def prepare_data(df, preprocessor=None, fit=False):
    """Prepare data for model training or prediction.

    Args:
        df: Raw transaction DataFrame. An 'is_fraud' column, when present,
            is split off as the target.
        preprocessor: Column transformer to use. When omitted, a fresh
            pipeline is created if ``fit`` is true; otherwise the
            preprocessor previously saved to disk is loaded, because an
            unfitted pipeline cannot transform.
        fit: When true, fit the preprocessor on this data and save it to
            disk via ``save_preprocessor``; when false, only transform.

    Returns:
        ``(X_transformed, y, preprocessor)`` where ``y`` is ``None`` if
        the input had no 'is_fraud' column.

    Raises:
        FileNotFoundError: if no preprocessor was given, ``fit`` is false
            and no saved preprocessor exists (raised by the loader).
    """
    # Preprocess the data
    df_processed = preprocess_data(df)

    # Separate features and target
    X = df_processed.drop(columns=['is_fraud'], errors='ignore')
    y = df_processed['is_fraud'] if 'is_fraud' in df_processed.columns else None

    if preprocessor is None:
        # A brand-new pipeline is only useful when it is about to be
        # fitted; for transform-only calls reuse the saved, fitted one.
        preprocessor = create_preprocessing_pipeline() if fit else load_preprocessor()

    # Transform features
    if fit:
        X_transformed = preprocessor.fit_transform(X)
        save_preprocessor(preprocessor)
    else:
        X_transformed = preprocessor.transform(X)

    return X_transformed, y, preprocessor
|
||||
@@ -0,0 +1,103 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
|
||||
import xgboost as xgb
|
||||
import joblib
|
||||
from pathlib import Path
|
||||
|
||||
from config import (
|
||||
TRAIN_DATA_PATH,
|
||||
TEST_DATA_PATH,
|
||||
MODELS_DIR,
|
||||
RANDOM_STATE,
|
||||
TEST_SIZE
|
||||
)
|
||||
from data_preprocessing import prepare_data
|
||||
|
||||
def load_data():
    """Read the train and test CSVs and return model-ready arrays.

    The preprocessor is fitted on the training file (prepare_data is
    called with fit=True) and that same fitted preprocessor is reused to
    transform the test file.

    Returns:
        (X_train, y_train, X_test, y_test)
    """
    raw_train = pd.read_csv(TRAIN_DATA_PATH)
    raw_test = pd.read_csv(TEST_DATA_PATH)

    # Fit on train only ...
    X_train, y_train, fitted = prepare_data(raw_train, fit=True)
    # ... and apply the fitted transformer to test.
    X_test, y_test, _ = prepare_data(raw_test, preprocessor=fitted)

    return X_train, y_train, X_test, y_test
|
||||
|
||||
def train_model(X_train, y_train):
    """Fit an XGBoost classifier on the training data and return it.

    Hyper-parameters are fixed in this function; the random seed comes
    from RANDOM_STATE.
    """
    classifier = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
    )
    classifier.fit(X_train, y_train)
    return classifier
|
||||
|
||||
def evaluate_model(model, X_test, y_test):
    """Evaluate the model performance.

    Prints the classification report, confusion matrix and ROC AUC, and
    returns the same information in a JSON-friendly dictionary.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Transformed test features.
        y_test: True test labels.

    Returns:
        A dict with keys 'classification_report' (nested dict),
        'confusion_matrix' (nested list) and 'roc_auc_score'.
    """
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Compute each metric once and reuse it for both printing and the
    # returned dict. The classification report is the exception: the text
    # and dict forms come from separate calls with different options.
    matrix = confusion_matrix(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(matrix)

    print("\nROC AUC Score:", auc)

    return {
        'classification_report': classification_report(y_test, y_pred, output_dict=True),
        'confusion_matrix': matrix.tolist(),
        'roc_auc_score': auc,
    }
|
||||
|
||||
def save_model(model, metrics, filename='fraud_model.joblib'):
    """Persist the model (joblib) and its metrics (JSON) under MODELS_DIR.

    The model goes to MODELS_DIR / filename; the metrics always go to
    MODELS_DIR / 'model_metrics.json'. The directory is created if needed.
    """
    import json

    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    model_file = MODELS_DIR / filename
    joblib.dump(model, model_file)

    metrics_file = MODELS_DIR / 'model_metrics.json'
    with metrics_file.open('w') as handle:
        json.dump(metrics, handle)
|
||||
|
||||
def main():
    """Run the training workflow: load, train, evaluate, save.

    Progress is reported on stdout before each stage.
    """
    print("Loading data...")
    features_train, labels_train, features_test, labels_test = load_data()

    print("Training model...")
    fitted_model = train_model(features_train, labels_train)

    print("Evaluating model...")
    scores = evaluate_model(fitted_model, features_test, labels_test)

    print("Saving model and metrics...")
    save_model(fitted_model, scores)

    print("Training completed successfully!")


# Only run the workflow when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
+129
@@ -0,0 +1,129 @@
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
import requests
|
||||
import json
|
||||
from datetime import datetime
|
||||
import random
|
||||
|
||||
# Prediction endpoint the form posts to.
API_URL = "http://localhost:8000/predict"

# Example transaction; its values pre-fill the form widgets.
SAMPLE_TRANSACTION = {
    # transaction
    "trans_date_trans_time": "2020-06-21 12:14:25",
    "cc_num": "1234567890123456",
    "merchant": "fraud_Rippin, Kub and Mann",
    "category": "misc_net",
    "amt": 4.97,
    # cardholder
    "first": "Jennifer",
    "last": "Banks",
    "gender": "F",
    # cardholder location
    "street": "561 Perry Cove",
    "city": "Moravian Falls",
    "state": "NC",
    "zip": "28654",
    "lat": 36.0788,
    "long": -81.1781,
    "city_pop": 3495,
    # cardholder profile
    "job": "Psychologist, counselling",
    "dob": "1988-03-09",
    # identifiers
    "trans_num": "0b242abb623afc578575680df30655b9",
    "unix_time": 1371816885,
    # merchant location
    "merch_lat": 36.011293,
    "merch_long": -82.048315,
}
|
||||
|
||||
def main():
    """Render the fraud-check form and display the API's verdict.

    The form is pre-filled from SAMPLE_TRANSACTION. On submit, the values
    are assembled into a transaction payload (card number and transaction
    id are randomly generated placeholders), posted to API_URL, and the
    returned prediction is shown. Connection problems and unusable API
    replies are reported in the page instead of raising.
    """
    st.title("Fraud Detection System")
    st.write("Enter transaction details to check for potential fraud.")

    # Create form for transaction details
    with st.form("transaction_form"):
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Transaction Details")
            trans_date = st.date_input("Transaction Date", datetime.now())
            trans_time = st.time_input("Transaction Time", datetime.now().time())
            merchant = st.text_input("Merchant", SAMPLE_TRANSACTION["merchant"])
            category = st.text_input("Category", SAMPLE_TRANSACTION["category"])
            amount = st.number_input("Amount", value=SAMPLE_TRANSACTION["amt"], min_value=0.0)

        with col2:
            st.subheader("Cardholder Details")
            first_name = st.text_input("First Name", SAMPLE_TRANSACTION["first"])
            last_name = st.text_input("Last Name", SAMPLE_TRANSACTION["last"])
            gender = st.selectbox("Gender", ["M", "F"], index=1)
            dob = st.date_input("Date of Birth", datetime.strptime(SAMPLE_TRANSACTION["dob"], "%Y-%m-%d"))
            job = st.text_input("Job", SAMPLE_TRANSACTION["job"])

        st.subheader("Location Details")
        col3, col4 = st.columns(2)

        with col3:
            street = st.text_input("Street", SAMPLE_TRANSACTION["street"])
            city = st.text_input("City", SAMPLE_TRANSACTION["city"])
            state = st.text_input("State", SAMPLE_TRANSACTION["state"])
            zip_code = st.text_input("ZIP Code", SAMPLE_TRANSACTION["zip"])
            lat = st.number_input("Latitude", value=SAMPLE_TRANSACTION["lat"])
            long = st.number_input("Longitude", value=SAMPLE_TRANSACTION["long"])
            city_pop = st.number_input("City Population", value=SAMPLE_TRANSACTION["city_pop"])

        with col4:
            merch_lat = st.number_input("Merchant Latitude", value=SAMPLE_TRANSACTION["merch_lat"])
            merch_long = st.number_input("Merchant Longitude", value=SAMPLE_TRANSACTION["merch_long"])

        submitted = st.form_submit_button("Check for Fraud")

    if not submitted:
        return

    # Prepare transaction data
    transaction = {
        "trans_date_trans_time": f"{trans_date} {trans_time}",
        # The form has no card-number field; a random 16-digit value is sent.
        "cc_num": str(random.randint(1000000000000000, 9999999999999999)),
        "merchant": merchant,
        "category": category,
        "amt": float(amount),
        "first": first_name,
        "last": last_name,
        "gender": gender,
        "street": street,
        "city": city,
        "state": state,
        "zip": zip_code,
        "lat": float(lat),
        "long": float(long),
        "city_pop": int(city_pop),
        "job": job,
        "dob": dob.strftime("%Y-%m-%d"),
        # Random 32-hex-digit id.
        "trans_num": f"{random.getrandbits(128):032x}",
        "unix_time": int(datetime.combine(trans_date, trans_time).timestamp()),
        "merch_lat": float(merch_lat),
        "merch_long": float(merch_long),
    }

    # Send request to API. A timeout keeps the page from hanging forever
    # if the server accepts the connection but never answers.
    try:
        response = requests.post(API_URL, json=transaction, timeout=30)
    except requests.exceptions.RequestException as e:
        st.error(f"Error connecting to the API: {str(e)}")
        st.info("Please make sure the API server is running.")
        return

    # The body may not be JSON at all (e.g. a proxy error page).
    try:
        result = response.json()
    except ValueError:
        result = None

    # Error replies (non-2xx, or a body without the prediction fields)
    # are shown as an error rather than failing on a missing key.
    if not response.ok or not isinstance(result, dict) or "is_fraud" not in result:
        detail = result.get("detail", result) if isinstance(result, dict) else response.text
        st.error(f"The API could not score this transaction (HTTP {response.status_code}): {detail}")
        return

    # Display results
    st.subheader("Fraud Detection Results")

    if result["is_fraud"]:
        st.error("⚠️ Fraudulent Transaction Detected!")
    else:
        st.success("✅ Legitimate Transaction")

    st.write(f"Fraud Probability: {result['fraud_probability']:.2%}")
    st.write(f"Confidence Level: {result['confidence']}")

    # Display additional information
    with st.expander("Transaction Details"):
        st.json(transaction)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user