update project structure and improve scripts
This commit is contained in:
@@ -49,6 +49,9 @@ data/processed/
|
||||
*.feather
|
||||
*.parquet
|
||||
|
||||
# Report files
|
||||
reports/
|
||||
|
||||
# Model files
|
||||
models/
|
||||
!models/README.md
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
from flask import Flask, render_template, request, jsonify
|
||||
import joblib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
app = Flask(__name__)

# The classifier is deserialized once, at import time.  A failure is reported
# on stdout and then re-raised so the application does not start without it.
try:
    model = joblib.load('models/fraud_model.pkl')
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    raise
|
||||
|
||||
|
||||
def preprocess_input(data):
    """Turn one submitted transaction into a single-row feature frame.

    ``data`` is a mapping of field name to raw value.  The returned frame
    keeps every supplied field except ``trans_date_trans_time`` and ``dob``
    and gains ``hour``, ``day_of_week``, ``month``, ``age`` and ``distance``.
    """
    frame = pd.DataFrame([data])

    # Values that cannot be parsed as numbers become NaN instead of raising.
    for column in ('amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'):
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

    # Calendar parts of the transaction timestamp.
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    stamp = frame['trans_date_trans_time'].dt
    frame['hour'] = stamp.hour
    frame['day_of_week'] = stamp.dayofweek
    frame['month'] = stamp.month

    # Whole 365-day years elapsed between the date of birth and now.
    frame['dob'] = pd.to_datetime(frame['dob'])
    elapsed = pd.to_datetime('today') - frame['dob']
    frame['age'] = elapsed.dt.days // 365

    # Euclidean distance computed directly on the raw coordinate values.
    lat_gap = frame['lat'].astype(float) - frame['merch_lat'].astype(float)
    long_gap = frame['long'].astype(float) - frame['merch_long'].astype(float)
    frame['distance'] = np.sqrt(lat_gap**2 + long_gap**2)

    # The source date columns are not part of the returned frame.
    return frame.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
|
||||
|
||||
|
||||
@app.route('/')
def home():
    """Serve the landing page by rendering ``index.html``."""
    return render_template('index.html')
|
||||
|
||||
|
||||
@app.route('/predict', methods=['POST'])
def predict():
    """Score the posted form and re-render ``index.html`` with the result.

    Any exception raised while reading, preprocessing, scoring or rendering
    is turned into a JSON ``{'error': ...}`` body with HTTP status 400.
    """
    try:
        features = preprocess_input(request.form.to_dict())

        # Class labels first, then the probability of class 1 for the row.
        labels = model.predict(features)
        fraud_score = model.predict_proba(features)[0][1]

        outcome = dict(
            prediction=int(labels[0]),
            probability=float(fraud_score),
            is_fraud=bool(labels[0]),
        )
        return render_template('index.html', prediction=outcome)
    except Exception as error:
        return jsonify({'error': str(error)}), 400
|
||||
|
||||
|
||||
# Start Flask's built-in server only when this file is executed directly.
# NOTE(review): debug=True is hard-coded here; confirm this entry point is
# never used for a production deployment.
if __name__ == '__main__':
    app.run(debug=True)
|
||||
@@ -0,0 +1,90 @@
|
||||
from flask import Flask, render_template, request
|
||||
import joblib
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
app = Flask(__name__)

# The classifier is deserialized once, at import time.  Success is announced
# on stdout; a failure is reported and then re-raised so start-up aborts.
try:
    model = joblib.load('models/fraud_model.pkl')
    print("Model loaded successfully")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    raise
|
||||
|
||||
def preprocess_input(data):
    """Build the single-row feature frame for one submitted transaction.

    Parameters
    ----------
    data : dict
        Field name -> raw value.  Must contain ``amt``, ``city_pop``,
        ``lat``, ``long``, ``merch_lat``, ``merch_long``,
        ``trans_date_trans_time`` and ``dob``.

    Returns
    -------
    pandas.DataFrame
        One row holding every supplied field except
        ``trans_date_trans_time`` and ``dob``, plus the derived ``hour``,
        ``day_of_week``, ``month``, ``age`` and ``distance`` columns.

    Raises
    ------
    KeyError
        If one of the required fields is absent from ``data``.
    """
    frame = pd.DataFrame([data])

    # Values that cannot be parsed as numbers become NaN instead of raising.
    for column in ('amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'):
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

    # Calendar parts of the transaction timestamp.
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    stamp = frame['trans_date_trans_time'].dt
    frame['hour'] = stamp.hour
    frame['day_of_week'] = stamp.dayofweek
    frame['month'] = stamp.month

    # Age in whole 365-day years, floored exactly like the training-time
    # feature engineering ((today - dob).dt.days // 365).  Serving a
    # fractional age would feed the model values it was never fitted on, and
    # timedelta arithmetic avoids depending on the datetime storage unit.
    frame['dob'] = pd.to_datetime(frame['dob'])
    frame['age'] = (pd.Timestamp.now() - frame['dob']).dt.days // 365

    # Euclidean distance computed directly on the raw coordinate values.
    lat_gap = frame['lat'].astype(float) - frame['merch_lat'].astype(float)
    long_gap = frame['long'].astype(float) - frame['merch_long'].astype(float)
    frame['distance'] = np.sqrt(lat_gap**2 + long_gap**2)

    # Guarantee every model input exists.  In practice only the categorical
    # fields can be missing at this point, because the numeric and derived
    # columns were all created above.
    expected_columns = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance',
                        'category', 'gender', 'job', 'merchant']
    for column in expected_columns:
        if column not in frame.columns:
            frame[column] = 0  # Default value if missing

    return frame.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
|
||||
|
||||
@app.route('/')
def home():
    """Serve the landing page by rendering ``index.html``."""
    return render_template('index.html')
|
||||
|
||||
@app.route('/predict', methods=['POST'])
def predict():
    """Score the posted form and re-render ``index.html`` with the result.

    The fraud probability placed in the result is a percentage (class-1
    probability times 100).  Each intermediate value is echoed to stdout.
    Any exception is printed and returned as plain text with status 400.
    """
    try:
        form_values = request.form.to_dict()
        print("Received data:", form_values)  # Debugging

        features = preprocess_input(form_values)
        print("Processed data:", features)  # Debugging

        class_scores = model.predict_proba(features)
        print("Raw probabilities:", class_scores)  # Debugging

        # Column 1 holds the fraud class; scale it to a percentage.
        fraud_percent = class_scores[0][1] * 100

        label = model.predict(features)[0]
        flagged = bool(label)

        outcome = dict(
            prediction=flagged,
            probability=float(fraud_percent),
            is_fraud=flagged,
        )
        print("Prediction result:", outcome)  # Debugging
        return render_template('index.html', prediction=outcome)
    except Exception as error:
        print("Prediction error:", str(error))  # Debugging
        return f"Error: {str(error)}", 400
|
||||
|
||||
# Start Flask's built-in server only when this file is executed directly.
# NOTE(review): debug=True is hard-coded here; confirm this entry point is
# never used for a production deployment.
if __name__ == '__main__':
    app.run(debug=True)
|
||||
@@ -0,0 +1,64 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Directory layout, rooted at the parent of the directory holding this module.
BASE_DIR = Path(__file__).parent.parent
DATA_DIR = BASE_DIR.joinpath('data')
RAW_DATA_DIR = DATA_DIR.joinpath('raw')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed')
MODELS_DIR = BASE_DIR.joinpath('models')
REPORTS_DIR = BASE_DIR.joinpath('reports')
FIGURES_DIR = REPORTS_DIR.joinpath('figures')

# Individual files inside those directories.
TRAIN_DATA_PATH = RAW_DATA_DIR.joinpath('fraudTrain.csv')
TEST_DATA_PATH = RAW_DATA_DIR.joinpath('fraudTest.csv')
TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR.joinpath('train_processed.csv')
TEST_PROCESSED_PATH = PROCESSED_DATA_DIR.joinpath('test_processed.csv')
MODEL_PATH = MODELS_DIR.joinpath('fraud_model.pkl')
|
||||
|
||||
# Model Configuration
# The estimator is stored below as a class object, so the name must be bound
# in this module; without this import the module fails with NameError as soon
# as it is imported.
from sklearn.ensemble import RandomForestClassifier  # noqa: E402

MODEL_PARAMS = {
    # Estimator class (not an instance); 'classifier_params' presumably holds
    # its constructor keyword arguments (verify against the consumer).
    'classifier': RandomForestClassifier,
    'classifier_params': {
        'n_estimators': 100,
        'max_depth': None,
        'min_samples_split': 2,
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1
    },
    # Column names grouped by how they are meant to be preprocessed.
    'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'],
    'categorical_features': ['category', 'gender', 'job', 'merchant']
}
|
||||
|
||||
# Feature engineering: derived column names by family, and the source columns
# listed for removal.
FEATURE_CONFIG = dict(
    time_features=['hour', 'day_of_week', 'month'],
    demographic_features=['age'],
    geographic_features=['distance'],
    drop_columns=['trans_date_trans_time', 'trans_num', 'dob', 'unix_time'],
)

# Evaluation: decision threshold, metric names and class display names.
EVAL_METRICS = dict(
    threshold=0.5,
    metrics=['precision', 'recall', 'f1', 'roc_auc', 'average_precision'],
    target_names=['Legitimate', 'Fraud'],
)
|
||||
|
||||
# Flask App Configuration
class FlaskConfig:
    """Configuration values for the Flask application.

    ``SECRET_KEY`` is taken from the ``SECRET_KEY`` environment variable when
    it is set, so a real secret never has to be committed.  The previous
    placeholder is kept only as a fallback for existing local setups.
    """

    DEBUG = True
    SECRET_KEY = os.environ.get('SECRET_KEY', 'your-secret-key-here')
    MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB upload limit
    JSONIFY_PRETTYPRINT_REGULAR = True
|
||||
|
||||
# Create directories if they don't exist
# This loop runs whenever the module is imported.  REPORTS_DIR is not listed
# but is still created, as the parent of FIGURES_DIR (parents=True).
for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

# Constants
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'  # strftime/strptime pattern
RANDOM_STATE = 42
TEST_SIZE = 0.2
|
||||
@@ -0,0 +1,68 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Load data with proper dtype specification
def load_data(filepath):
    """Read a transactions CSV into a DataFrame.

    ``cc_num`` and ``zip`` are identifiers rather than quantities, so they
    are read as strings.  Letting pandas infer integers would drop leading
    zeros before any later string conversion could preserve them.  Columns
    named here but absent from the file are simply not affected.

    ``filepath`` is anything ``pandas.read_csv`` accepts (path or buffer).
    """
    identifier_dtypes = {'cc_num': str, 'zip': str}
    return pd.read_csv(filepath, low_memory=False, dtype=identifier_dtypes)
|
||||
|
||||
# Read the raw train and test files; the paths are relative to the working
# directory the script is started from.
train_df = load_data('data/raw/fraudTrain.csv')
test_df = load_data('data/raw/fraudTest.csv')
|
||||
|
||||
# Data cleaning function
def clean_data(df):
    """Return a cleaned copy of ``df``; the caller's frame is not modified.

    Steps, in order:
      1. ``merch_lat`` / ``merch_long`` are stripped of every character that
         is not a digit, ``.`` or ``-`` and parsed as numbers (NaN when the
         remainder is not a valid number).
      2. Rows containing any missing value are dropped.
      3. Exact duplicate rows are dropped (first occurrence kept).
      4. ``cc_num`` and ``zip`` are converted to strings.

    Building new frames with ``assign`` avoids writing into the input and
    avoids assigning into the result of ``dropna`` / ``drop_duplicates``.
    """

    def _coerce_coordinate(column):
        # Remove stray characters from malformed values before parsing.
        stripped = column.astype(str).str.replace(r'[^\d.-]', '', regex=True)
        return pd.to_numeric(stripped, errors='coerce')

    cleaned = df.assign(
        merch_lat=_coerce_coordinate(df['merch_lat']),
        merch_long=_coerce_coordinate(df['merch_long']),
    )
    cleaned = cleaned.dropna().drop_duplicates()

    return cleaned.assign(
        cc_num=cleaned['cc_num'].astype(str),
        zip=cleaned['zip'].astype(str),
    )
|
||||
|
||||
|
||||
# Clean both splits with the same rules; the names are rebound to the results.
train_df = clean_data(train_df)
test_df = clean_data(test_df)
|
||||
|
||||
|
||||
# Feature engineering function
def create_features(df):
    """Add the derived feature columns to ``df`` in place and return it.

    ``trans_date_trans_time`` and ``dob`` are converted to datetimes, then
    ``hour``, ``day_of_week``, ``month``, ``age``, ``distance`` and
    ``trans_count_last_24h`` are added.

    NOTE: despite its name, ``trans_count_last_24h`` is the number of
    ``trans_num`` values per ``cc_num`` over the whole frame; no time window
    is applied.
    """
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['dob'] = pd.to_datetime(df['dob'])

    # Calendar parts of the transaction timestamp.
    when = df['trans_date_trans_time'].dt
    for column, part in (('hour', 'hour'), ('day_of_week', 'dayofweek'), ('month', 'month')):
        df[column] = getattr(when, part)

    # Whole 365-day years elapsed between the date of birth and now.
    elapsed = pd.to_datetime('today') - df['dob']
    df['age'] = elapsed.dt.days // 365

    # Euclidean distance computed directly on the raw coordinate values.
    lat_gap = df['lat'].astype(float) - df['merch_lat'].astype(float)
    long_gap = df['long'].astype(float) - df['merch_long'].astype(float)
    df['distance'] = np.sqrt(lat_gap**2 + long_gap**2)

    # Per-card transaction count, broadcast back onto every row of the card.
    per_card = df.groupby('cc_num')['trans_num']
    df['trans_count_last_24h'] = per_card.transform('count')

    return df
|
||||
|
||||
|
||||
# Apply feature engineering
train_df = create_features(train_df)
test_df = create_features(test_df)

# Save processed data
# index=False keeps the DataFrame index out of the written files.
train_df.to_csv('data/processed/train_processed.csv', index=False)
test_df.to_csv('data/processed/test_processed.csv', index=False)
|
||||
@@ -0,0 +1,111 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import joblib
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.metrics import (classification_report, confusion_matrix,
|
||||
roc_auc_score, precision_recall_curve,
|
||||
average_precision_score, RocCurveDisplay)
|
||||
from sklearn.calibration import calibration_curve
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
|
||||
# Loading helpers
def load_model_and_data(model_path, test_file):
    """Return ``(model, test_df)`` for evaluation.

    The model is deserialized from ``model_path``; the CSV at ``test_file``
    is read and passed through ``feature_engineering``.
    """
    fitted = joblib.load(model_path)
    frame = feature_engineering(pd.read_csv(test_file))
    return fitted, frame
|
||||
|
||||
|
||||
def feature_engineering(df):
    """Add the derived model features to ``df`` in place and return it.

    Adds ``hour``, ``day_of_week``, ``month``, ``age`` and ``distance``;
    ``trans_date_trans_time`` and ``dob`` are converted to datetimes.
    """
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calendar parts of the transaction timestamp.
    when = df['trans_date_trans_time'].dt
    for column, part in (('hour', 'hour'), ('day_of_week', 'dayofweek'), ('month', 'month')):
        df[column] = getattr(when, part)

    # Whole 365-day years elapsed between the date of birth and now.
    df['dob'] = pd.to_datetime(df['dob'])
    elapsed = pd.to_datetime('today') - df['dob']
    df['age'] = elapsed.dt.days // 365

    # Euclidean distance computed directly on the raw coordinate values.
    lat_gap = df['lat'] - df['merch_lat']
    long_gap = df['long'] - df['merch_long']
    df['distance'] = np.sqrt(lat_gap**2 + long_gap**2)
    return df
|
||||
|
||||
|
||||
def evaluate_model(model, test_df):
    """Comprehensive model evaluation

    Prints a classification report and the ROC AUC score, and saves four
    figures under ``reports/figures/``: confusion matrix, ROC curve,
    precision-recall curve and calibration curve.

    ``test_df`` must contain ``is_fraud`` plus the other columns dropped
    below; ``model`` must provide ``predict`` and ``predict_proba``.
    Nothing is returned.
    """
    # Prepare test data
    X_test = test_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
    y_test = test_df['is_fraud']

    # Generate predictions
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the fraud score throughout.
    y_proba = model.predict_proba(X_test)[:, 1]

    # Classification Report
    print("="*50)
    print("Classification Report:")
    print("="*50)
    print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud']))

    # Confusion Matrix
    plt.figure(figsize=(8,6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Legitimate', 'Fraud'],
                yticklabels=['Legitimate', 'Fraud'])
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.savefig('reports/figures/confusion_matrix.png')
    plt.close()

    # ROC Curve
    roc_auc = roc_auc_score(y_test, y_proba)
    print(f"\nROC AUC Score: {roc_auc:.4f}")
    # No plt.figure() call here: the display helper is relied on to supply
    # the figure that the following pyplot calls title and save.
    RocCurveDisplay.from_predictions(y_test, y_proba)
    plt.title(f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.savefig('reports/figures/roc_curve.png')
    plt.close()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ap_score = average_precision_score(y_test, y_proba)
    plt.figure(figsize=(8,6))
    plt.plot(recall, precision, label=f'AP = {ap_score:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.savefig('reports/figures/precision_recall_curve.png')
    plt.close()

    # Calibration Curve
    prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)
    plt.figure(figsize=(8,6))
    plt.plot(prob_pred, prob_true, marker='o')
    # Dashed diagonal from (0, 0) to (1, 1) as the reference line.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('Predicted Probability')
    plt.ylabel('True Probability')
    plt.title('Calibration Curve')
    plt.savefig('reports/figures/calibration_curve.png')
    plt.close()
|
||||
|
||||
|
||||
def main():
    """Run the evaluation end to end.

    Ensures ``reports/figures`` exists, loads the saved model together with
    the processed test set, and writes the evaluation reports.
    """
    import os

    # The figures are saved here, so the directory must exist first.
    os.makedirs('reports/figures', exist_ok=True)

    fitted, frame = load_model_and_data(
        'models/fraud_model.pkl',
        'data/processed/test_processed.csv',
    )
    evaluate_model(fitted, frame)

    print("\nEvaluation complete. Reports saved to reports/ directory")


if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import classification_report
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
import joblib
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def load_data(train_file, test_file):
    """Read the train and test CSVs and drop every row with a missing value.

    Returns ``(train_df, test_df)``.  Both files are read before any rows
    are dropped.
    """
    frames = [pd.read_csv(source) for source in (train_file, test_file)]

    # Incomplete rows are removed from each frame in place.
    for frame in frames:
        frame.dropna(inplace=True)

    train_df, test_df = frames
    return train_df, test_df
|
||||
|
||||
|
||||
def feature_engineering(df):
    """Add the derived model features to ``df`` in place and return it.

    Adds ``hour``, ``day_of_week``, ``month``, ``age`` and ``distance``;
    ``trans_date_trans_time`` and ``dob`` are converted to datetimes.
    """
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calendar parts of the transaction timestamp.
    when = df['trans_date_trans_time'].dt
    for column, part in (('hour', 'hour'), ('day_of_week', 'dayofweek'), ('month', 'month')):
        df[column] = getattr(when, part)

    # Whole 365-day years elapsed between the date of birth and now.
    df['dob'] = pd.to_datetime(df['dob'])
    elapsed = pd.to_datetime('today') - df['dob']
    df['age'] = elapsed.dt.days // 365

    # Euclidean distance computed directly on the raw coordinate values.
    lat_gap = df['lat'] - df['merch_lat']
    long_gap = df['long'] - df['merch_long']
    df['distance'] = np.sqrt(lat_gap**2 + long_gap**2)

    return df
|
||||
|
||||
|
||||
def train_model(train_df, test_df):
    """Fit the preprocessing + random-forest pipeline and return it.

    Both frames must contain ``is_fraud`` and the other columns excluded
    below.  After fitting on ``train_df`` a classification report for
    ``test_df`` is printed.
    """
    target = 'is_fraud'
    excluded = [target, 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time']

    X_train = train_df.drop(excluded, axis=1)
    y_train = train_df[target]
    X_test = test_df.drop(excluded, axis=1)
    y_test = test_df[target]

    # Numeric columns are standardized; categorical columns are one-hot
    # encoded, with categories unseen during fitting ignored at predict time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(),
             ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance']),
            ('cat', OneHotEncoder(handle_unknown='ignore'),
             ['category', 'gender', 'job', 'merchant']),
        ])

    forest = RandomForestClassifier(class_weight='balanced', random_state=42)
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ])

    model.fit(X_train, y_train)

    # Report performance on the held-out frame.
    y_pred = model.predict(X_test)
    print("Test Set Performance:")
    print(classification_report(y_test, y_pred))

    return model
|
||||
|
||||
|
||||
def main():
    """Train the fraud model from the raw CSVs and save it to disk."""
    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')

    # Derive the model features on both splits.
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Row counts after the missing-value rows were dropped on load.
    print(f"Training samples after cleaning: {len(train_df)}")
    print(f"Test samples after cleaning: {len(test_df)}")

    fitted = train_model(train_df, test_df)

    # Persist the fitted pipeline.
    joblib.dump(fitted, 'models/fraud_model.pkl')
    print("Model saved to models/fraud_model.pkl")


if __name__ == "__main__":
    main()
|
||||
|
||||
-100
@@ -1,100 +0,0 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import classification_report
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
import joblib
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def load_data(train_file, test_file):
    """Read the train and test CSVs and drop every row with a missing value.

    Returns ``(train_df, test_df)``.  Both files are read before any rows
    are dropped.
    """
    frames = [pd.read_csv(source) for source in (train_file, test_file)]

    # Incomplete rows are removed from each frame in place.
    for frame in frames:
        frame.dropna(inplace=True)

    train_df, test_df = frames
    return train_df, test_df
|
||||
|
||||
|
||||
def feature_engineering(df):
    """Add the derived model features to ``df`` in place and return it.

    Adds ``hour``, ``day_of_week``, ``month``, ``age`` and ``distance``;
    ``trans_date_trans_time`` and ``dob`` are converted to datetimes.
    """
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calendar parts of the transaction timestamp.
    when = df['trans_date_trans_time'].dt
    for column, part in (('hour', 'hour'), ('day_of_week', 'dayofweek'), ('month', 'month')):
        df[column] = getattr(when, part)

    # Whole 365-day years elapsed between the date of birth and now.
    df['dob'] = pd.to_datetime(df['dob'])
    elapsed = pd.to_datetime('today') - df['dob']
    df['age'] = elapsed.dt.days // 365

    # Euclidean distance computed directly on the raw coordinate values.
    lat_gap = df['lat'] - df['merch_lat']
    long_gap = df['long'] - df['merch_long']
    df['distance'] = np.sqrt(lat_gap**2 + long_gap**2)

    return df
|
||||
|
||||
|
||||
def train_model(train_df, test_df):
    """Fit the preprocessing + random-forest pipeline and return it.

    Both frames must contain ``is_fraud`` and the other columns excluded
    below.  After fitting on ``train_df`` a classification report for
    ``test_df`` is printed.
    """
    target = 'is_fraud'
    excluded = [target, 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time']

    X_train = train_df.drop(excluded, axis=1)
    y_train = train_df[target]
    X_test = test_df.drop(excluded, axis=1)
    y_test = test_df[target]

    # Numeric columns are standardized; categorical columns are one-hot
    # encoded, with categories unseen during fitting ignored at predict time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(),
             ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance']),
            ('cat', OneHotEncoder(handle_unknown='ignore'),
             ['category', 'gender', 'job', 'merchant']),
        ])

    forest = RandomForestClassifier(class_weight='balanced', random_state=42)
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ])

    model.fit(X_train, y_train)

    # Report performance on the held-out frame.
    y_pred = model.predict(X_test)
    print("Test Set Performance:")
    print(classification_report(y_test, y_pred))

    return model
|
||||
|
||||
|
||||
def main():
    """Train the fraud model from the raw CSVs and save it to disk."""
    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')

    # Derive the model features on both splits.
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Row counts after the missing-value rows were dropped on load.
    print(f"Training samples after cleaning: {len(train_df)}")
    print(f"Test samples after cleaning: {len(test_df)}")

    fitted = train_model(train_df, test_df)

    # Persist the fitted pipeline.
    joblib.dump(fitted, 'models/fraud_model.pkl')
    print("Model saved to models/fraud_model.pkl")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user