50e95445fb
Defined file structure and completed EDA
160 lines
3.3 KiB
Plaintext
160 lines
3.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2c5baf8e",
|
|
"metadata": {},
|
|
"source": [
|
|
"# 📊 Exploratory Data Analysis: Fraud Detection Dataset"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "2f3e6a97",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Analysis stack: pandas for tabular data, matplotlib/seaborn for plotting.\n",
  "import pandas as pd\n",
  "import matplotlib.pyplot as plt\n",
  "import seaborn as sns\n",
  "\n",
  "# Relative path: resolved against the kernel's current working directory.\n",
  "DATA_PATH = \"fraudTest.csv\"\n",
  "df = pd.read_csv(DATA_PATH)\n",
  "\n",
  "# Preview the first rows as the cell's rich output.\n",
  "df.head()"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2bcadae6",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 🧾 Basic Overview of the Dataset"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "820cb0e9",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Quick structural overview: size, column dtypes, null counts and duplicated rows.\n",
  "print(\"Shape:\", df.shape)\n",
  "\n",
  "# sep='\\n' starts each Series on its own line; print's default single-space\n",
  "# separator would otherwise shift the first row right by one character.\n",
  "print(\"\\nData Types:\", df.dtypes, sep=\"\\n\")\n",
  "print(\"\\nMissing Values:\", df.isnull().sum(), sep=\"\\n\")\n",
  "\n",
  "# duplicated() flags rows that repeat an earlier row across all columns.\n",
  "print(\"\\nDuplicate Rows:\", df.duplicated().sum())"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "caa22db9",
|
|
"metadata": {},
|
|
"source": [
|
|
"## ⚖️ Class Balance"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "7fb75259",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Bar chart of the number of transactions per is_fraud label.\n",
  "ax = sns.countplot(x=\"is_fraud\", data=df)\n",
  "ax.set_title(\"Fraud vs Non-Fraud Transactions\")\n",
  "plt.show()\n",
  "\n",
  "# Mean of the label column = share of fraud rows (assumes is_fraud is 0/1 coded - TODO confirm).\n",
  "fraud_ratio = df[\"is_fraud\"].mean()\n",
  "print(f\"Fraudulent transactions: {fraud_ratio:.4%}\")"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "658e9cd2",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 📊 Statistical Summary"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "202e2612",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Summary statistics for every column (numeric and non-numeric alike).\n",
  "# Transposed so each feature is a row: on wide frames the notebook truncates\n",
  "# columns beyond pandas' display.max_columns, which would hide some features.\n",
  "df.describe(include='all').T"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "12d24a95",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 🔗 Correlation Matrix"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "3c02acf0",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Pairwise correlations, restricted to the numeric columns.\n",
  "corr_matrix = df.corr(numeric_only=True)\n",
  "\n",
  "# Annotated heatmap; each cell shows the coefficient to two decimals.\n",
  "fig, ax = plt.subplots(figsize=(12, 8))\n",
  "sns.heatmap(corr_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", ax=ax)\n",
  "ax.set_title(\"Feature Correlation Matrix\")\n",
  "plt.show()"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "fce8183a",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 💵 Transaction Amount Distribution by Fraud"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "ea72b131",
 "metadata": {},
 "outputs": [],
 "source": [
  "# One box of transaction amounts per is_fraud label.\n",
  "fig, ax = plt.subplots(figsize=(10, 6))\n",
  "sns.boxplot(x='is_fraud', y='amt', data=df, ax=ax)\n",
  "\n",
  "# Log-scaled y axis so the boxes stay readable next to very large amounts.\n",
  "ax.set_yscale('log')\n",
  "ax.set_title(\"Transaction Amount by Fraud Status\")\n",
  "plt.show()"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a7d7d378",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 🕒 Transaction Timing (Hourly)"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "5f26f36f",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Parse the timestamp column and derive the hour of day for each transaction.\n",
  "# Re-running this cell is safe: to_datetime leaves an already-parsed column unchanged.\n",
  "df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])\n",
  "df['hour'] = df['trans_date_trans_time'].dt.hour\n",
  "\n",
  "fig, ax = plt.subplots(figsize=(12, 6))\n",
  "# discrete=True draws one unit-width bar centred on each integer hour. With bins=24\n",
  "# and data spanning hours 0-23 the bin edges are 23/24 apart, so bars drift off the ticks.\n",
  "sns.histplot(data=df, x='hour', hue='is_fraud', multiple='stack', discrete=True, ax=ax)\n",
  "ax.set_xticks(range(24))\n",
  "ax.set_xlabel(\"Hour of day\")\n",
  "ax.set_title(\"Transaction Hour Distribution\")\n",
  "plt.show()"
 ]
}
|
|
],
|
|
"metadata": {},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|