From 6c1224eacaa23b0058354109839fcf3cd2fcf470 Mon Sep 17 00:00:00 2001 From: Ayomide Date: Fri, 25 Jul 2025 20:10:44 +0100 Subject: [PATCH] update project structure and improve scripts --- .gitignore | 3 + Dockerfile | 0 app.py | 79 -------------- deployment/cloud_run.sh | 0 deployment/docker-compose.yml | 0 src/api/app.py | 90 ++++++++++++++++ src/config.py | 64 +++++++++++ src/data_preprocessing.py | 68 ++++++++++++ src/model_evaluation.py | 111 ++++++++++++++++++++ src/model_training.py | 100 ++++++++++++++++++ src/web/app.py | 0 {static => src/web/static}/style.css | 0 {templates => src/web/templates}/index.html | 0 train_model.py | 100 ------------------ 14 files changed, 436 insertions(+), 179 deletions(-) delete mode 100644 Dockerfile delete mode 100644 app.py delete mode 100644 deployment/cloud_run.sh delete mode 100644 deployment/docker-compose.yml delete mode 100644 src/web/app.py rename {static => src/web/static}/style.css (100%) rename {templates => src/web/templates}/index.html (100%) delete mode 100644 train_model.py diff --git a/.gitignore b/.gitignore index f561a7d..b4edb9a 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ data/processed/ *.feather *.parquet +# Report files +reports/ + # Model files models/ !models/README.md diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index e69de29..0000000 diff --git a/app.py b/app.py deleted file mode 100644 index 06ff7fd..0000000 --- a/app.py +++ /dev/null @@ -1,79 +0,0 @@ -from flask import Flask, render_template, request, jsonify -import joblib -import pandas as pd -import numpy as np -from datetime import datetime - -app = Flask(__name__) - -# Load the model -try: - model = joblib.load('models/fraud_model.pkl') -except Exception as e: - print(f"Error loading model: {e}") - raise - - -def preprocess_input(data): - # Convert to DataFrame - df = pd.DataFrame([data]) - - # Convert numeric fields explicitly - numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'] - for field in numeric_fields: - df[field] = pd.to_numeric(df[field], errors='coerce') - - # Convert transaction time to datetime - df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) - - # Extract time features - df['hour'] = df['trans_date_trans_time'].dt.hour - df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek - df['month'] = df['trans_date_trans_time'].dt.month - - # Calculate age from dob - df['dob'] = pd.to_datetime(df['dob']) - df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365 - - # Calculate distance between user and merchant - df['distance'] = np.sqrt( - (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + - (df['long'].astype(float) - df['merch_long'].astype(float))**2 - ) - - # Drop unnecessary columns - return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore') - - -@app.route('/') -def home(): - return render_template('index.html') - - -@app.route('/predict', methods=['POST']) -def predict(): - try: - # Get data from form - data = request.form.to_dict() - - # Preprocess the input - processed_data = preprocess_input(data) - - # Make prediction - prediction = model.predict(processed_data) - probability = model.predict_proba(processed_data)[0][1] - - result = { - 'prediction': int(prediction[0]), - 'probability': float(probability), - 'is_fraud': bool(prediction[0]) - } - - return render_template('index.html', prediction=result) - - except Exception as e: - return jsonify({'error': str(e)}), 400 - - -if __name__ == '__main__': - app.run(debug=True) diff --git a/deployment/cloud_run.sh b/deployment/cloud_run.sh deleted file mode 100644 index e69de29..0000000 diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml deleted file mode 100644 index e69de29..0000000 diff --git a/src/api/app.py b/src/api/app.py index e69de29..9f3584e 100644 --- a/src/api/app.py +++ b/src/api/app.py @@ -0,0 +1,90 @@ +from flask import Flask, render_template, request +import joblib +import pandas as pd +import numpy as np +from datetime import datetime + +app = Flask(__name__) + +# Load the model +try: + model = joblib.load('models/fraud_model.pkl') + print("Model loaded successfully") +except Exception as e: + print(f"Error loading model: {e}") + raise + +def preprocess_input(data): + # Convert to DataFrame + df = pd.DataFrame([data]) + + # Ensure numeric fields are properly converted + numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'] + for field in numeric_fields: + df[field] = pd.to_numeric(df[field], errors='coerce') + + # Convert and extract datetime features + df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) + df['hour'] = df['trans_date_trans_time'].dt.hour + df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek + df['month'] = df['trans_date_trans_time'].dt.month + + # Calculate age from dob + df['dob'] = pd.to_numeric(pd.to_datetime(df['dob']).astype(np.int64) / 10**9) + df['age'] = (pd.Timestamp.now().timestamp() - df['dob']) / (365 * 24 * 3600) + + # Calculate distance safely + df['distance'] = np.sqrt( + (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + + (df['long'].astype(float) - df['merch_long'].astype(float))**2 + ) + + # Ensure all expected columns are present + expected_columns = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance', + 'category', 'gender', 'job', 'merchant'] + for col in expected_columns: + if col not in df.columns: + df[col] = 0 # Default value if missing + + return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore') + +@app.route('/') +def home(): + return render_template('index.html') + +@app.route('/predict', methods=['POST']) +def predict(): + try: + # Get data from form + data = request.form.to_dict() + print("Received data:", data) # Debugging + + # Preprocess the input + processed_data = preprocess_input(data) + print("Processed data:", processed_data) # Debugging + + # Get prediction probabilities + probabilities = model.predict_proba(processed_data) + print("Raw probabilities:", probabilities) # Debugging + + # Extract fraud probability (class 1) + fraud_probability = probabilities[0][1] * 100 # Convert to percentage + + # Make prediction + prediction = model.predict(processed_data)[0] + + result = { + 'prediction': bool(prediction), + 'probability': float(fraud_probability), + 'is_fraud': bool(prediction) + } + + print("Prediction result:", result) # Debugging + return render_template('index.html', prediction=result) + + except Exception as e: + print("Prediction error:", str(e)) # Debugging + return f"Error: {str(e)}", 400 + +if __name__ == '__main__': + app.run(debug=True) \ No newline at end of file diff --git a/src/config.py b/src/config.py index e69de29..b00f267 100644 --- a/src/config.py +++ b/src/config.py @@ -0,0 +1,64 @@ +import os +from pathlib import Path + +# Directory Paths +BASE_DIR = Path(__file__).parent.parent +DATA_DIR = BASE_DIR / 'data' +RAW_DATA_DIR = DATA_DIR / 'raw' +PROCESSED_DATA_DIR = DATA_DIR / 'processed' +MODELS_DIR = BASE_DIR / 'models' +REPORTS_DIR = BASE_DIR / 'reports' +FIGURES_DIR = REPORTS_DIR / 'figures' + +# File Paths +TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv' +TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv' +TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv' +TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv' +MODEL_PATH = MODELS_DIR / 'fraud_model.pkl' + +# Model Configuration +MODEL_PARAMS = { + 'classifier': RandomForestClassifier, + 'classifier_params': { + 'n_estimators': 100, + 'max_depth': None, + 'min_samples_split': 2, + 'class_weight': 'balanced', + 'random_state': 42, + 'n_jobs': -1 + }, + 'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'], + 'categorical_features': ['category', 'gender', 'job', 'merchant'] +} + +# Feature Engineering Configuration +FEATURE_CONFIG = { + 'time_features': ['hour', 'day_of_week', 'month'], + 'demographic_features': ['age'], + 'geographic_features': ['distance'], + 'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'] +} + +# Evaluation Metrics +EVAL_METRICS = { + 'threshold': 0.5, + 'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'], + 'target_names': ['Legitimate', 'Fraud'] +} + +# Flask App Configuration +class FlaskConfig: + DEBUG = True + SECRET_KEY = 'your-secret-key-here' + MAX_CONTENT_LENGTH = 16 * 1024 * 1024 # 16MB upload limit + JSONIFY_PRETTYPRINT_REGULAR = True + +# Create directories if they don't exist +for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]: + directory.mkdir(parents=True, exist_ok=True) + +# Constants +DATE_FORMAT = '%Y-%m-%d %H:%M:%S' +RANDOM_STATE = 42 +TEST_SIZE = 0.2 \ No newline at end of file diff --git a/src/data_preprocessing.py b/src/data_preprocessing.py index e69de29..066e6c2 100644 --- a/src/data_preprocessing.py +++ b/src/data_preprocessing.py @@ -0,0 +1,68 @@ +import pandas as pd +import numpy as np +from datetime import datetime + + +# Load data with proper dtype specification +def load_data(filepath): + return pd.read_csv(filepath, low_memory=False) + +train_df = load_data('data/raw/fraudTrain.csv') +test_df = load_data('data/raw/fraudTest.csv') + +# Data cleaning function +def clean_data(df): + # Fix merchant coordinates (handling malformed values) + df['merch_lat'] = pd.to_numeric(df['merch_lat'].astype(str).str.replace(r'[^\d.-]', '', regex=True), errors='coerce') + df['merch_long'] = pd.to_numeric(df['merch_long'].astype(str).str.replace(r'[^\d.-]', '', regex=True), errors='coerce') + + # Drop rows with missing values + df = df.dropna() + + # Drop duplicates + df = df.drop_duplicates() + + # Ensure proper data types + df['cc_num'] = df['cc_num'].astype(str) + df['zip'] = df['zip'].astype(str) + + return df + + +train_df = clean_data(train_df) +test_df = clean_data(test_df) + + +# Feature engineering function +def create_features(df): + # Convert datetime columns + df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) + df['dob'] = pd.to_datetime(df['dob']) + + # Time-based features + df['hour'] = df['trans_date_trans_time'].dt.hour + df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek + df['month'] = df['trans_date_trans_time'].dt.month + + # Demographic features + df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365 + + # Geographic features + df['distance'] = np.sqrt( + (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + + (df['long'].astype(float) - df['merch_long'].astype(float))**2 + ) + + # Transaction frequency features + df['trans_count_last_24h'] = df.groupby('cc_num')['trans_num'].transform('count') + + return df + + +# Apply feature engineering +train_df = create_features(train_df) +test_df = create_features(test_df) + +# Save processed data +train_df.to_csv('data/processed/train_processed.csv', index=False) +test_df.to_csv('data/processed/test_processed.csv', index=False) \ No newline at end of file diff --git a/src/model_evaluation.py b/src/model_evaluation.py index e69de29..059bac5 100644 --- a/src/model_evaluation.py +++ b/src/model_evaluation.py @@ -0,0 +1,111 @@ +import pandas as pd +import numpy as np +import joblib +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.metrics import (classification_report, confusion_matrix, + roc_auc_score, precision_recall_curve, + average_precision_score, RocCurveDisplay) +from sklearn.calibration import calibration_curve +from sklearn.pipeline import Pipeline + + +# Constants for feature engineering +def load_model_and_data(model_path, test_file): + """Load trained model and test data""" + model = joblib.load(model_path) + test_df = pd.read_csv(test_file) + test_df = feature_engineering(test_df) + return model, test_df + + +def feature_engineering(df): + """Replicate feature engineering from training""" + df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) + df['hour'] = df['trans_date_trans_time'].dt.hour + df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek + df['month'] = df['trans_date_trans_time'].dt.month + df['dob'] = pd.to_datetime(df['dob']) + df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365 + df['distance'] = np.sqrt((df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2) + return df + + +def evaluate_model(model, test_df): + """Comprehensive model evaluation""" + # Prepare test data + X_test = test_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) + y_test = test_df['is_fraud'] + + # Generate predictions + y_pred = model.predict(X_test) + y_proba = model.predict_proba(X_test)[:, 1] + + # Classification Report + print("="*50) + print("Classification Report:") + print("="*50) + print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud'])) + + # Confusion Matrix + plt.figure(figsize=(8,6)) + cm = confusion_matrix(y_test, y_pred) + sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', + xticklabels=['Legitimate', 'Fraud'], + yticklabels=['Legitimate', 'Fraud']) + plt.title('Confusion Matrix') + plt.ylabel('Actual') + plt.xlabel('Predicted') + plt.savefig('reports/figures/confusion_matrix.png') + plt.close() + + # ROC Curve + roc_auc = roc_auc_score(y_test, y_proba) + print(f"\nROC AUC Score: {roc_auc:.4f}") + RocCurveDisplay.from_predictions(y_test, y_proba) + plt.title(f'ROC Curve (AUC = {roc_auc:.4f})') + plt.savefig('reports/figures/roc_curve.png') + plt.close() + + # Precision-Recall Curve + precision, recall, _ = precision_recall_curve(y_test, y_proba) + ap_score = average_precision_score(y_test, y_proba) + plt.figure(figsize=(8,6)) + plt.plot(recall, precision, label=f'AP = {ap_score:.4f}') + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.title('Precision-Recall Curve') + plt.legend() + plt.savefig('reports/figures/precision_recall_curve.png') + plt.close() + + # Calibration Curve + prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10) + plt.figure(figsize=(8,6)) + plt.plot(prob_pred, prob_true, marker='o') + plt.plot([0, 1], [0, 1], linestyle='--') + plt.xlabel('Predicted Probability') + plt.ylabel('True Probability') + plt.title('Calibration Curve') + plt.savefig('reports/figures/calibration_curve.png') + plt.close() + + +def main(): + # Create directories for reports + import os + os.makedirs('reports/figures', exist_ok=True) + + # Load model and test data + model, test_df = load_model_and_data( + 'models/fraud_model.pkl', + 'data/processed/test_processed.csv' + ) + + # Run evaluation + evaluate_model(model, test_df) + + print("\nEvaluation complete. Reports saved to reports/ directory") + +if __name__ == "__main__": + main() diff --git a/src/model_training.py b/src/model_training.py index e69de29..398ce71 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -0,0 +1,100 @@ +import pandas as pd +import numpy as np +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline +import joblib +from datetime import datetime + + +def load_data(train_file, test_file): + train_df = pd.read_csv(train_file) + test_df = pd.read_csv(test_file) + + # Drop rows with missing values + train_df.dropna(inplace=True) + test_df.dropna(inplace=True) + + return train_df, test_df + + +def feature_engineering(df): + # Convert transaction time to datetime + df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) + + # Extract time features + df['hour'] = df['trans_date_trans_time'].dt.hour + df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek + df['month'] = df['trans_date_trans_time'].dt.month + + # Calculate age from dob + df['dob'] = pd.to_datetime(df['dob']) + df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365 + + # Calculate distance between user and merchant + df['distance'] = np.sqrt((df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2) + + return df + + +def train_model(train_df, test_df): + # Define features and target + X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) + y_train = train_df['is_fraud'] + + X_test = test_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) + y_test = test_df['is_fraud'] + + # Define preprocessing + numeric_features = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'] + numeric_transformer = StandardScaler() + + categorical_features = ['category', 'gender', 'job', 'merchant'] + categorical_transformer = OneHotEncoder(handle_unknown='ignore') + + preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) + ]) + + # Create pipeline + model = Pipeline(steps=[ + ('preprocessor', preprocessor), + ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)) + ]) + + # Train model + model.fit(X_train, y_train) + + # Evaluate on test data + y_pred = model.predict(X_test) + print("Test Set Performance:") + print(classification_report(y_test, y_pred)) + + return model + + +def main(): + # Load data + train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv') + + # Feature engineering + train_df = feature_engineering(train_df) + test_df = feature_engineering(test_df) + + # Print dataset sizes after cleaning + print(f"Training samples after cleaning: {len(train_df)}") + print(f"Test samples after cleaning: {len(test_df)}") + + # Train model + model = train_model(train_df, test_df) + + # Save model + joblib.dump(model, 'models/fraud_model.pkl') + print("Model saved to models/fraud_model.pkl") + +if __name__ == "__main__": + main() diff --git a/src/web/app.py b/src/web/app.py deleted file mode 100644 index e69de29..0000000 diff --git a/static/style.css b/src/web/static/style.css similarity index 100% rename from static/style.css rename to src/web/static/style.css diff --git a/templates/index.html b/src/web/templates/index.html similarity index 100% rename from templates/index.html rename to src/web/templates/index.html diff --git a/train_model.py b/train_model.py deleted file mode 100644 index 398ce71..0000000 --- a/train_model.py +++ /dev/null @@ -1,100 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import classification_report -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline -import joblib -from datetime import datetime - - -def load_data(train_file, test_file): - train_df = pd.read_csv(train_file) - test_df = pd.read_csv(test_file) - - # Drop rows with missing values - train_df.dropna(inplace=True) - test_df.dropna(inplace=True) - - return train_df, test_df - - -def feature_engineering(df): - # Convert transaction time to datetime - df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) - - # Extract time features - df['hour'] = df['trans_date_trans_time'].dt.hour - df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek - df['month'] = df['trans_date_trans_time'].dt.month - - # Calculate age from dob - df['dob'] = pd.to_datetime(df['dob']) - df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365 - - # Calculate distance between user and merchant - df['distance'] = np.sqrt((df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2) - - return df - - -def train_model(train_df, test_df): - # Define features and target - X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) - y_train = train_df['is_fraud'] - - X_test = test_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1) - y_test = test_df['is_fraud'] - - # Define preprocessing - numeric_features = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'] - numeric_transformer = StandardScaler() - - categorical_features = ['category', 'gender', 'job', 'merchant'] - categorical_transformer = OneHotEncoder(handle_unknown='ignore') - - preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features) - ]) - - # Create pipeline - model = Pipeline(steps=[ - ('preprocessor', preprocessor), - ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)) - ]) - - # Train model - model.fit(X_train, y_train) - - # Evaluate on test data - y_pred = model.predict(X_test) - print("Test Set Performance:") - print(classification_report(y_test, y_pred)) - - return model - - -def main(): - # Load data - train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv') - - # Feature engineering - train_df = feature_engineering(train_df) - test_df = feature_engineering(test_df) - - # Print dataset sizes after cleaning - print(f"Training samples after cleaning: {len(train_df)}") - print(f"Test samples after cleaning: {len(test_df)}") - - # Train model - model = train_model(train_df, test_df) - - # Save model - joblib.dump(model, 'models/fraud_model.pkl') - print("Model saved to models/fraud_model.pkl") - -if __name__ == "__main__": - main()