update project structure and improve scripts

2025-07-25 20:10:44 +01:00
parent b058aaf8fc
commit 6c1224eaca
14 changed files with 436 additions and 179 deletions
@@ -49,6 +49,9 @@ data/processed/
 *.feather
 *.parquet

+# Report files
+reports/
+
 # Model files
 models/
 !models/README.md
@@ -1,79 +0,0 @@
-from flask import Flask, render_template, request, jsonify
-import joblib
-import pandas as pd
-import numpy as np
-from datetime import datetime
-
-app = Flask(__name__)
-
-# Load the model
-try:
-    model = joblib.load('models/fraud_model.pkl')
-except Exception as e:
-    print(f"Error loading model: {e}")
-    raise
-
-
-def preprocess_input(data):
-    # Convert to DataFrame
-    df = pd.DataFrame([data])
-
-    # Convert numeric fields explicitly
-    numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']
-    for field in numeric_fields:
-        df[field] = pd.to_numeric(df[field], errors='coerce')
-
-    # Convert transaction time to datetime
-    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
-
-    # Extract time features
-    df['hour'] = df['trans_date_trans_time'].dt.hour
-    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
-    df['month'] = df['trans_date_trans_time'].dt.month
-
-    # Calculate age from dob
-    df['dob'] = pd.to_datetime(df['dob'])
-    df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365
-
-    # Calculate distance between user and merchant
-    df['distance'] = np.sqrt(
-        (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 + 
-        (df['long'].astype(float) - df['merch_long'].astype(float))**2
-    )
-
-    # Drop unnecessary columns
-    return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
-
-
-@app.route('/')
-def home():
-    return render_template('index.html')
-
-
-@app.route('/predict', methods=['POST'])
-def predict():
-    try:
-        # Get data from form
-        data = request.form.to_dict()
-
-        # Preprocess the input
-        processed_data = preprocess_input(data)
-
-        # Make prediction
-        prediction = model.predict(processed_data)
-        probability = model.predict_proba(processed_data)[0][1]
-
-        result = {
-            'prediction': int(prediction[0]),
-            'probability': float(probability),
-            'is_fraud': bool(prediction[0])
-        }
-
-        return render_template('index.html', prediction=result)
-
-    except Exception as e:
-        return jsonify({'error': str(e)}), 400
-
-
-if __name__ == '__main__':
-    app.run(debug=True)
@@ -0,0 +1,90 @@
+from flask import Flask, render_template, request
+import joblib
+import pandas as pd
+import numpy as np
+from datetime import datetime
+
+app = Flask(__name__)
+
+# Deserialize the trained model once at import time; any failure is reported
+# and re-raised so the app does not start without a model.
+try:
+    model = joblib.load('models/fraud_model.pkl')
+except Exception as load_error:
+    print(f"Error loading model: {load_error}")
+    raise
+else:
+    print("Model loaded successfully")
+
+def preprocess_input(data):
+    """Build the single-row feature frame for one submitted transaction.
+
+    Args:
+        data: Mapping of field name to raw value. ``predict`` passes
+            ``request.form.to_dict()``. Must contain ``amt``, ``city_pop``,
+            ``lat``, ``long``, ``merch_lat``, ``merch_long``,
+            ``trans_date_trans_time`` and ``dob``.
+
+    Returns:
+        A one-row DataFrame holding the submitted fields plus the derived
+        ``hour``, ``day_of_week``, ``month``, ``age`` and ``distance``
+        columns, with ``trans_date_trans_time`` and ``dob`` removed.
+
+    Raises:
+        KeyError: If one of the required fields is missing from ``data``.
+        Exception: Date parsing errors from ``pd.to_datetime`` propagate.
+    """
+    df = pd.DataFrame([data])
+
+    # Unparseable numbers become NaN instead of raising
+    numeric_fields = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']
+    for field in numeric_fields:
+        df[field] = pd.to_numeric(df[field], errors='coerce')
+
+    # Calendar features of the transaction timestamp
+    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
+    df['hour'] = df['trans_date_trans_time'].dt.hour
+    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
+    df['month'] = df['trans_date_trans_time'].dt.month
+
+    # Age in whole years, computed with the same expression as the training
+    # script's feature_engineering so the served feature matches the one the
+    # model was fitted on (it was previously a fractional value derived from
+    # epoch seconds).
+    df['dob'] = pd.to_datetime(df['dob'])
+    df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365
+
+    # Euclidean distance between cardholder and merchant coordinates
+    df['distance'] = np.sqrt(
+        (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 +
+        (df['long'].astype(float) - df['merch_long'].astype(float))**2
+    )
+
+    # Only the categorical inputs can still be absent here: the numeric ones
+    # were either read above (raising KeyError when missing) or just derived.
+    # The training script one-hot encodes these columns with
+    # handle_unknown='ignore', so a string placeholder is used rather than the
+    # integer 0 (presumably the fitted categories are strings; verify against
+    # the training data).
+    categorical_columns = ['category', 'gender', 'job', 'merchant']
+    for col in categorical_columns:
+        if col not in df.columns:
+            df[col] = 'unknown'
+
+    return df.drop(['trans_date_trans_time', 'dob'], axis=1, errors='ignore')
+
+@app.route('/')
+def home():
+    """Render ``index.html`` with no prediction context."""
+    page = render_template('index.html')
+    return page
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    """Score the submitted form and re-render the page with the result.
+
+    Returns:
+        On success, ``index.html`` rendered with a ``prediction`` dict whose
+        ``prediction`` and ``is_fraud`` entries are booleans and whose
+        ``probability`` is the class-1 probability as a percentage. On any
+        exception, a plain-text error message with HTTP status 400.
+    """
+    try:
+        # Get data from form
+        data = request.form.to_dict()
+        print("Received data:", data)  # Debugging
+
+        # Preprocess the input
+        processed_data = preprocess_input(data)
+        print("Processed data:", processed_data)  # Debugging
+
+        # Get prediction probabilities
+        probabilities = model.predict_proba(processed_data)
+        print("Raw probabilities:", probabilities)  # Debugging
+
+        # Extract fraud probability (class 1)
+        fraud_probability = probabilities[0][1] * 100  # Convert to percentage
+
+        # Make prediction
+        prediction = model.predict(processed_data)[0]
+
+        result = {
+            'prediction': bool(prediction),
+            'probability': float(fraud_probability),
+            'is_fraud': bool(prediction)
+        }
+
+        print("Prediction result:", result)  # Debugging
+        return render_template('index.html', prediction=result)
+
+    except Exception as e:
+        print("Prediction error:", str(e))  # Debugging
+        # A bare string response is served as text/html, and the exception
+        # text may echo submitted form values. Declaring text/plain keeps the
+        # browser from interpreting any markup in it.
+        return f"Error: {e}", 400, {'Content-Type': 'text/plain; charset=utf-8'}
+
+# Script entry point: start the Flask development server.
+# NOTE(review): debug=True turns on Flask's debug mode; confirm this entry
+# point is only ever used for local development.
+if __name__ == '__main__':
+    app.run(debug=True)
@@ -0,0 +1,64 @@
+import os
+from pathlib import Path
+
+from sklearn.ensemble import RandomForestClassifier
+
+# Directory Paths
+# BASE_DIR is the parent of the directory that contains this file.
+BASE_DIR = Path(__file__).parent.parent
+DATA_DIR = BASE_DIR / 'data'
+RAW_DATA_DIR = DATA_DIR / 'raw'
+PROCESSED_DATA_DIR = DATA_DIR / 'processed'
+MODELS_DIR = BASE_DIR / 'models'
+REPORTS_DIR = BASE_DIR / 'reports'
+FIGURES_DIR = REPORTS_DIR / 'figures'
+
+# File Paths
+TRAIN_DATA_PATH = RAW_DATA_DIR / 'fraudTrain.csv'
+TEST_DATA_PATH = RAW_DATA_DIR / 'fraudTest.csv'
+TRAIN_PROCESSED_PATH = PROCESSED_DATA_DIR / 'train_processed.csv'
+TEST_PROCESSED_PATH = PROCESSED_DATA_DIR / 'test_processed.csv'
+MODEL_PATH = MODELS_DIR / 'fraud_model.pkl'
+
+# Constants
+DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
+RANDOM_STATE = 42
+TEST_SIZE = 0.2
+
+# Model Configuration
+MODEL_PARAMS = {
+    # Estimator class (not an instance). It was previously referenced without
+    # an import, so importing this module raised NameError.
+    'classifier': RandomForestClassifier,
+    'classifier_params': {
+        'n_estimators': 100,
+        'max_depth': None,
+        'min_samples_split': 2,
+        'class_weight': 'balanced',
+        'random_state': RANDOM_STATE,
+        'n_jobs': -1
+    },
+    'numeric_features': ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance'],
+    'categorical_features': ['category', 'gender', 'job', 'merchant']
+}
+
+# Feature Engineering Configuration
+FEATURE_CONFIG = {
+    'time_features': ['hour', 'day_of_week', 'month'],
+    'demographic_features': ['age'],
+    'geographic_features': ['distance'],
+    'drop_columns': ['trans_date_trans_time', 'trans_num', 'dob', 'unix_time']
+}
+
+# Evaluation Metrics
+EVAL_METRICS = {
+    'threshold': 0.5,
+    'metrics': ['precision', 'recall', 'f1', 'roc_auc', 'average_precision'],
+    'target_names': ['Legitimate', 'Fraud']
+}
+
+
+# Flask App Configuration
+class FlaskConfig:
+    """Settings container for the Flask application."""
+
+    DEBUG = True
+    # Taken from the SECRET_KEY environment variable when it is set; the
+    # placeholder below is only a fallback and must not be relied on.
+    SECRET_KEY = os.environ.get('SECRET_KEY', 'your-secret-key-here')
+    MAX_CONTENT_LENGTH = 16 * 1024 * 1024  # 16MB upload limit
+    JSONIFY_PRETTYPRINT_REGULAR = True
+
+
+# Create directories if they don't exist (runs whenever this module is imported)
+for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, FIGURES_DIR]:
+    directory.mkdir(parents=True, exist_ok=True)
@@ -0,0 +1,68 @@
+import pandas as pd
+import numpy as np
+from datetime import datetime
+
+
+# CSV loader; column dtypes are inferred by pandas (none are specified here)
+def load_data(filepath):
+    """Read the CSV at ``filepath`` into a DataFrame with ``low_memory=False``."""
+    frame = pd.read_csv(filepath, low_memory=False)
+    return frame
+
+# Read the raw train and test files (train first, then test)
+train_df, test_df = (
+    load_data('data/raw/fraudTrain.csv'),
+    load_data('data/raw/fraudTest.csv'),
+)
+
+# Data cleaning function
+def clean_data(df):
+    """Return a cleaned copy of ``df``, leaving the caller's frame untouched.
+
+    Steps, in order: strip every character other than digits, ``.`` and ``-``
+    from ``merch_lat``/``merch_long`` and convert them to numbers (values that
+    still fail to parse become NaN); drop rows containing any missing value;
+    drop duplicate rows; cast ``cc_num`` and ``zip`` to ``str``.
+
+    Args:
+        df: Frame with at least ``merch_lat``, ``merch_long``, ``cc_num`` and
+            ``zip`` columns.
+
+    Returns:
+        A new DataFrame; surviving rows keep their original index labels.
+
+    Raises:
+        KeyError: If one of the four columns above is missing.
+    """
+    # Work on a copy. The previous version overwrote the caller's coordinate
+    # columns in place while returning a different, filtered frame.
+    cleaned = df.copy()
+
+    # Fix merchant coordinates (handling malformed values)
+    for column in ('merch_lat', 'merch_long'):
+        digits_only = cleaned[column].astype(str).str.replace(r'[^\d.-]', '', regex=True)
+        cleaned[column] = pd.to_numeric(digits_only, errors='coerce')
+
+    # Drop incomplete rows, then exact duplicates
+    cleaned = cleaned.dropna().drop_duplicates()
+
+    # assign() builds a new frame rather than setting columns on the filtered
+    # result of dropna()/drop_duplicates()
+    return cleaned.assign(
+        cc_num=cleaned['cc_num'].astype(str),
+        zip=cleaned['zip'].astype(str),
+    )
+
+
+# Clean both splits (train first, then test)
+train_df, test_df = clean_data(train_df), clean_data(test_df)
+
+
+# Feature engineering function
+def _count_trailing_24h(cards, times):
+    """Count each row's same-card transactions in the 24 hours ending at it.
+
+    The window for a row at time ``t`` is ``(t - 24h, t]``, so the row itself
+    is always counted. Returns an int64 array in the row order of the inputs.
+    """
+    stamps = times.to_numpy()
+    counts = np.zeros(len(stamps), dtype=np.int64)
+    one_day = np.timedelta64(24, 'h')
+
+    # .indices maps each card to the integer positions of its rows
+    for positions in cards.groupby(cards, sort=False).indices.values():
+        order = positions[np.argsort(stamps[positions], kind='stable')]
+        ordered = stamps[order]
+        # Number of the card's transactions at or before the window start
+        expired = np.searchsorted(ordered, ordered - one_day, side='right')
+        counts[order] = np.arange(1, len(order) + 1) - expired
+
+    return counts
+
+
+def create_features(df):
+    """Add time, age, distance and card-activity features to ``df`` in place.
+
+    Args:
+        df: Frame with ``trans_date_trans_time``, ``dob``, ``lat``, ``long``,
+            ``merch_lat``, ``merch_long`` and ``cc_num`` columns.
+
+    Returns:
+        The same ``df`` object with the two date columns converted to
+        datetimes and ``hour``, ``day_of_week``, ``month``, ``age``,
+        ``distance`` and ``trans_count_last_24h`` added.
+    """
+    # Convert datetime columns
+    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
+    df['dob'] = pd.to_datetime(df['dob'])
+
+    # Time-based features
+    df['hour'] = df['trans_date_trans_time'].dt.hour
+    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
+    df['month'] = df['trans_date_trans_time'].dt.month
+
+    # Demographic features (age is relative to the day the script runs)
+    df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365
+
+    # Geographic features
+    df['distance'] = np.sqrt(
+        (df['lat'].astype(float) - df['merch_lat'].astype(float))**2 +
+        (df['long'].astype(float) - df['merch_long'].astype(float))**2
+    )
+
+    # Transaction frequency features. This used to be the card's total row
+    # count in the whole file, which did not match the column name.
+    df['trans_count_last_24h'] = _count_trailing_24h(
+        df['cc_num'], df['trans_date_trans_time']
+    )
+
+    return df
+
+
+# Apply feature engineering
+train_df = create_features(train_df)
+test_df = create_features(test_df)
+
+# Save processed data. to_csv does not create missing directories and nothing
+# else in this script does, so make sure the output directory exists first.
+import os
+
+os.makedirs('data/processed', exist_ok=True)
+train_df.to_csv('data/processed/train_processed.csv', index=False)
+test_df.to_csv('data/processed/test_processed.csv', index=False)
@@ -0,0 +1,111 @@
+import pandas as pd
+import numpy as np
+import joblib
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import (classification_report, confusion_matrix, 
+                            roc_auc_score, precision_recall_curve, 
+                            average_precision_score, RocCurveDisplay)
+from sklearn.calibration import calibration_curve
+from sklearn.pipeline import Pipeline
+
+
+# Model and test-data loading
+def load_model_and_data(model_path, test_file):
+    """Return ``(model, test_df)`` loaded from disk.
+
+    The model is read with ``joblib.load``. The test CSV is read with pandas
+    and passed through ``feature_engineering`` before being returned.
+    """
+    model = joblib.load(model_path)
+    engineered = feature_engineering(pd.read_csv(test_file))
+    return model, engineered
+
+
+def feature_engineering(df):
+    """Add the time, age and distance feature columns to ``df`` in place.
+
+    Converts ``trans_date_trans_time`` and ``dob`` to datetimes, then adds
+    ``hour``, ``day_of_week``, ``month``, ``age`` (whole years as of today)
+    and ``distance`` (Euclidean distance between the two coordinate pairs).
+    Returns the same ``df`` object.
+    """
+    transaction_time = pd.to_datetime(df['trans_date_trans_time'])
+    df['trans_date_trans_time'] = transaction_time
+    df['hour'] = transaction_time.dt.hour
+    df['day_of_week'] = transaction_time.dt.dayofweek
+    df['month'] = transaction_time.dt.month
+
+    birth_date = pd.to_datetime(df['dob'])
+    df['dob'] = birth_date
+    df['age'] = (pd.to_datetime('today') - birth_date).dt.days // 365
+
+    lat_gap = df['lat'] - df['merch_lat']
+    long_gap = df['long'] - df['merch_long']
+    df['distance'] = np.sqrt(lat_gap**2 + long_gap**2)
+    return df
+
+
+def evaluate_model(model, test_df, figures_dir='reports/figures'):
+    """Print evaluation metrics for ``model`` and save four diagnostic plots.
+
+    Prints a classification report and the ROC AUC score, and writes
+    ``confusion_matrix.png``, ``roc_curve.png``,
+    ``precision_recall_curve.png`` and ``calibration_curve.png``.
+
+    Args:
+        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
+        test_df: Frame containing the ``is_fraud`` label. It must also contain
+            ``trans_date_trans_time``, ``trans_num``, ``dob`` and
+            ``unix_time``, which are dropped before prediction.
+        figures_dir: Directory the PNG files are written to; created when
+            missing. Defaults to the location that was previously hard-coded.
+
+    Raises:
+        KeyError: If ``test_df`` lacks ``is_fraud`` or a column to be dropped.
+    """
+    from pathlib import Path
+
+    # savefig does not create missing directories, so callers no longer have
+    # to prepare the output directory themselves.
+    output_dir = Path(figures_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Prepare test data
+    X_test = test_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
+    y_test = test_df['is_fraud']
+
+    # Generate predictions (hard labels and class-1 probabilities)
+    y_pred = model.predict(X_test)
+    y_proba = model.predict_proba(X_test)[:, 1]
+
+    # Classification Report
+    print("="*50)
+    print("Classification Report:")
+    print("="*50)
+    print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud']))
+
+    # Confusion Matrix
+    plt.figure(figsize=(8,6))
+    cm = confusion_matrix(y_test, y_pred)
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+                xticklabels=['Legitimate', 'Fraud'],
+                yticklabels=['Legitimate', 'Fraud'])
+    plt.title('Confusion Matrix')
+    plt.ylabel('Actual')
+    plt.xlabel('Predicted')
+    plt.savefig(output_dir / 'confusion_matrix.png')
+    plt.close()
+
+    # ROC Curve (no plt.figure call here, exactly as before)
+    roc_auc = roc_auc_score(y_test, y_proba)
+    print(f"\nROC AUC Score: {roc_auc:.4f}")
+    RocCurveDisplay.from_predictions(y_test, y_proba)
+    plt.title(f'ROC Curve (AUC = {roc_auc:.4f})')
+    plt.savefig(output_dir / 'roc_curve.png')
+    plt.close()
+
+    # Precision-Recall Curve
+    precision, recall, _ = precision_recall_curve(y_test, y_proba)
+    ap_score = average_precision_score(y_test, y_proba)
+    plt.figure(figsize=(8,6))
+    plt.plot(recall, precision, label=f'AP = {ap_score:.4f}')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision-Recall Curve')
+    plt.legend()
+    plt.savefig(output_dir / 'precision_recall_curve.png')
+    plt.close()
+
+    # Calibration Curve, with the diagonal drawn as a dashed reference line
+    prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=10)
+    plt.figure(figsize=(8,6))
+    plt.plot(prob_pred, prob_true, marker='o')
+    plt.plot([0, 1], [0, 1], linestyle='--')
+    plt.xlabel('Predicted Probability')
+    plt.ylabel('True Probability')
+    plt.title('Calibration Curve')
+    plt.savefig(output_dir / 'calibration_curve.png')
+    plt.close()
+
+
+def main():
+    """Evaluate the saved model on the processed test set."""
+    import os
+
+    # Make sure the figure output directory exists before anything is plotted
+    os.makedirs('reports/figures', exist_ok=True)
+
+    model_path = 'models/fraud_model.pkl'
+    test_path = 'data/processed/test_processed.csv'
+    model, test_df = load_model_and_data(model_path, test_path)
+
+    evaluate_model(model, test_df)
+
+    print("\nEvaluation complete. Reports saved to reports/ directory")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,100 @@
+import pandas as pd
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+import joblib
+from datetime import datetime
+
+
+def load_data(train_file, test_file):
+    """Read the train and test CSVs and drop every row with a missing value.
+
+    Returns:
+        ``(train_df, test_df)``; surviving rows keep their original index.
+    """
+    train_df, test_df = pd.read_csv(train_file), pd.read_csv(test_file)
+
+    # Remove incomplete rows from both frames
+    for frame in (train_df, test_df):
+        frame.dropna(inplace=True)
+
+    return train_df, test_df
+
+
+def feature_engineering(df):
+    """Add the derived model features to ``df`` in place and return it.
+
+    ``trans_date_trans_time`` and ``dob`` are converted to datetimes; the new
+    columns are ``hour``, ``day_of_week``, ``month``, ``age`` and ``distance``.
+    """
+    # Parse the transaction timestamp and pull out its calendar parts
+    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
+    when = df['trans_date_trans_time'].dt
+    df['hour'] = when.hour
+    df['day_of_week'] = when.dayofweek
+    df['month'] = when.month
+
+    # Whole years between the date of birth and today
+    df['dob'] = pd.to_datetime(df['dob'])
+    elapsed = pd.to_datetime('today') - df['dob']
+    df['age'] = elapsed.dt.days // 365
+
+    # Euclidean distance between cardholder and merchant coordinates
+    squared_gap = (df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2
+    df['distance'] = np.sqrt(squared_gap)
+
+    return df
+
+
+def train_model(train_df, test_df):
+    """Fit the preprocessing + random-forest pipeline and report on the test set.
+
+    Args:
+        train_df: Feature-engineered training frame including ``is_fraud``.
+        test_df: Feature-engineered test frame including ``is_fraud``.
+
+    Returns:
+        The fitted ``Pipeline`` (column preprocessing followed by the
+        classifier). A classification report for ``test_df`` is printed.
+    """
+    # Label plus the columns that are removed before fitting
+    non_feature_columns = ['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time']
+
+    X_train, y_train = train_df.drop(non_feature_columns, axis=1), train_df['is_fraud']
+    X_test, y_test = test_df.drop(non_feature_columns, axis=1), test_df['is_fraud']
+
+    # Numeric columns are standardised, categorical ones are one-hot encoded;
+    # categories unseen during fitting are ignored at prediction time.
+    numeric_features = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance']
+    categorical_features = ['category', 'gender', 'job', 'merchant']
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', StandardScaler(), numeric_features),
+            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
+        ])
+
+    classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
+    model = Pipeline(steps=[
+        ('preprocessor', preprocessor),
+        ('classifier', classifier),
+    ])
+
+    model.fit(X_train, y_train)
+
+    # Evaluate on test data
+    y_pred = model.predict(X_test)
+    print("Test Set Performance:")
+    print(classification_report(y_test, y_pred))
+
+    return model
+
+
+def main():
+    """Train the fraud model from the raw CSVs and save it under ``models/``."""
+    import os
+
+    # Load data
+    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')
+
+    # Feature engineering
+    train_df = feature_engineering(train_df)
+    test_df = feature_engineering(test_df)
+
+    # Print dataset sizes after cleaning
+    print(f"Training samples after cleaning: {len(train_df)}")
+    print(f"Test samples after cleaning: {len(test_df)}")
+
+    # Train model
+    model = train_model(train_df, test_df)
+
+    # Save model. Nothing in this script creates models/, so make sure it
+    # exists rather than losing a finished training run on the final write.
+    os.makedirs('models', exist_ok=True)
+    joblib.dump(model, 'models/fraud_model.pkl')
+    print("Model saved to models/fraud_model.pkl")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,100 +0,0 @@
-import pandas as pd
-import numpy as np
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
-from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-import joblib
-from datetime import datetime
-
-
-def load_data(train_file, test_file):
-    train_df = pd.read_csv(train_file)
-    test_df = pd.read_csv(test_file)
-
-    # Drop rows with missing values
-    train_df.dropna(inplace=True)
-    test_df.dropna(inplace=True)
-
-    return train_df, test_df
-
-
-def feature_engineering(df):
-    # Convert transaction time to datetime
-    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
-
-    # Extract time features
-    df['hour'] = df['trans_date_trans_time'].dt.hour
-    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
-    df['month'] = df['trans_date_trans_time'].dt.month
-
-    # Calculate age from dob
-    df['dob'] = pd.to_datetime(df['dob'])
-    df['age'] = (pd.to_datetime('today') - df['dob']).dt.days // 365
-
-    # Calculate distance between user and merchant
-    df['distance'] = np.sqrt((df['lat']-df['merch_lat'])**2 + (df['long']-df['merch_long'])**2)
-
-    return df
-
-
-def train_model(train_df, test_df):
-    # Define features and target
-    X_train = train_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
-    y_train = train_df['is_fraud']
-
-    X_test = test_df.drop(['is_fraud', 'trans_date_trans_time', 'trans_num', 'dob', 'unix_time'], axis=1)
-    y_test = test_df['is_fraud']
-
-    # Define preprocessing
-    numeric_features = ['amt', 'city_pop', 'hour', 'day_of_week', 'month', 'age', 'distance']
-    numeric_transformer = StandardScaler()
-
-    categorical_features = ['category', 'gender', 'job', 'merchant']
-    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
-
-    preprocessor = ColumnTransformer(
-        transformers=[
-            ('num', numeric_transformer, numeric_features),
-            ('cat', categorical_transformer, categorical_features)
-        ])
-
-    # Create pipeline
-    model = Pipeline(steps=[
-        ('preprocessor', preprocessor),
-        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
-    ])
-
-    # Train model
-    model.fit(X_train, y_train)
-
-    # Evaluate on test data
-    y_pred = model.predict(X_test)
-    print("Test Set Performance:")
-    print(classification_report(y_test, y_pred))
-
-    return model
-
-
-def main():
-    # Load data
-    train_df, test_df = load_data('data/raw/fraudTrain.csv', 'data/raw/fraudTest.csv')
-
-    # Feature engineering
-    train_df = feature_engineering(train_df)
-    test_df = feature_engineering(test_df)
-
-    # Print dataset sizes after cleaning
-    print(f"Training samples after cleaning: {len(train_df)}")
-    print(f"Test samples after cleaning: {len(test_df)}")
-
-    # Train model
-    model = train_model(train_df, test_df)
-
-    # Save model
-    joblib.dump(model, 'models/fraud_model.pkl')
-    print("Model saved to models/fraud_model.pkl")
-
-if __name__ == "__main__":
-    main()