diff --git a/data/processed/assessment_prediction/testid/output.csv b/data/processed/assessment_prediction/testid/output.csv
index fe2c484..3320ae3 100644
--- a/data/processed/assessment_prediction/testid/output.csv
+++ b/data/processed/assessment_prediction/testid/output.csv
@@ -1,6 +1,13 @@
-start_date,end_date,open_items,red_flags,num_employees,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,time_since_last_event,percentage_change_open_items
-2023-01-01,2023-01-02,10,2,30,0,0,1,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
-2023-01-08,2023-01-09,12,1,25,1,0,0,10.0,0.0,0.0,10.0,12.0,0.0,7.0,19.999999999999996
-2023-01-15,2023-01-16,11,3,28,0,1,0,0.0,12.0,0.0,10.0,12.0,11.0,7.0,-8.333333333333337
-2023-01-22,2023-01-23,9,1,30,0,0,1,0.0,0.0,11.0,9.0,12.0,11.0,7.0,-18.181818181818176
-2023-01-29,2023-01-30,13,4,27,1,0,0,9.0,0.0,0.0,9.0,13.0,11.0,7.0,44.44444444444444
+open_items,red_flags,num_employees,duration,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,percentage_change_open_items
+10,2,30,1,0,0,1,0.0,0.0,0.0,10.0,0.0,0.0,0.0
+12,1,25,1,1,0,0,10.0,0.0,0.0,10.0,12.0,0.0,19.999999999999996
+11,3,28,1,0,1,0,0.0,12.0,0.0,10.0,12.0,11.0,-8.333333333333337
+9,1,30,1,0,0,1,0.0,0.0,11.0,9.0,12.0,11.0,-18.181818181818176
+13,4,27,1,1,0,0,9.0,0.0,0.0,9.0,13.0,11.0,44.44444444444444
+14,2,26,1,0,0,1,0.0,13.0,0.0,11.5,13.0,0.0,7.692307692307687
+15,1,31,1,0,1,0,14.0,0.0,0.0,14.0,13.0,15.0,7.14285714285714
+16,3,29,1,1,0,0,0.0,0.0,15.0,14.0,16.0,15.0,6.666666666666665
+12,2,25,1,0,0,1,0.0,16.0,0.0,12.0,16.0,15.0,-25.0
+11,1,30,1,0,1,0,12.0,0.0,0.0,12.0,16.0,11.0,-8.333333333333337
+10,4,27,1,0,0,1,0.0,0.0,11.0,11.0,0.0,11.0,-9.090909090909093
+9,3,26,1,1,0,0,10.0,0.0,0.0,10.0,9.0,11.0,-9.999999999999998
diff --git a/data/raw/erp_assessment_prediction/testid_raw_data.csv b/data/raw/erp_assessment_prediction/testid_raw_data.csv
new file mode 100644
index 0000000..0474ab6
--- /dev/null
+++ b/data/raw/erp_assessment_prediction/testid_raw_data.csv
@@ -0,0 +1,13 @@
+start_date,end_date,open_items,red_flags,num_employees,assessment_type
+2023-01-01,2023-01-02,10,2,30,weekly
+2023-01-08,2023-01-09,12,1,25,biweekly
+2023-01-15,2023-01-16,11,3,28,quarterly
+2023-01-22,2023-01-23,9,1,30,weekly
+2023-01-29,2023-01-30,13,4,27,biweekly
+2023-02-05,2023-02-06,14,2,26,weekly
+2023-02-12,2023-02-13,15,1,31,quarterly
+2023-02-19,2023-02-20,16,3,29,biweekly
+2023-02-26,2023-02-27,12,2,25,weekly
+2023-03-05,2023-03-06,11,1,30,quarterly
+2023-03-12,2023-03-13,10,4,27,weekly
+2023-03-19,2023-03-20,9,3,26,biweekly
diff --git a/models/assessment_prediction/testid/testid_latest_data.csv b/models/assessment_prediction/testid/testid_latest_data.csv
new file mode 100644
index 0000000..df6e4c4
--- /dev/null
+++ b/models/assessment_prediction/testid/testid_latest_data.csv
@@ -0,0 +1,2 @@
+open_items,red_flags,num_employees,duration,assessment_type_biweekly,assessment_type_quarterly,assessment_type_weekly,open_items_assessment_type_weekly_lag_1,open_items_assessment_type_biweekly_lag_1,open_items_assessment_type_quarterly_lag_1,open_items_weekly_ma_3,open_items_biweekly_ma_3,open_items_quarterly_ma_3,percentage_change_open_items
+9,3,26,1,1,0,0,10.0,0.0,0.0,10.0,9.0,11.0,-9.999999999999998
diff --git a/models/assessment_prediction/testid/testid_model.pkl b/models/assessment_prediction/testid/testid_model.pkl
new file mode 100644
index 0000000..cb53fa1
Binary files /dev/null and b/models/assessment_prediction/testid/testid_model.pkl differ
diff --git a/notebooks/test b/notebooks/test
new file mode 100644
index 0000000..faf8e7d
--- /dev/null
+++ b/notebooks/test
@@ -0,0 +1,13 @@
+,start_date,end_date,open_items,red_flags,num_employees,assessment_type
+0,2023-01-01,2023-01-02,10,2,30,weekly
+1,2023-01-08,2023-01-09,12,1,25,biweekly
+2,2023-01-15,2023-01-16,11,3,28,quarterly
+3,2023-01-22,2023-01-23,9,1,30,weekly
+4,2023-01-29,2023-01-30,13,4,27,biweekly
+5,2023-02-05,2023-02-06,14,2,26,weekly
+6,2023-02-12,2023-02-13,15,1,31,quarterly
+7,2023-02-19,2023-02-20,16,3,29,biweekly
+8,2023-02-26,2023-02-27,12,2,25,weekly
+9,2023-03-05,2023-03-06,11,1,30,quarterly
+10,2023-03-12,2023-03-13,10,4,27,weekly
+11,2023-03-19,2023-03-20,9,3,26,biweekly
diff --git a/notebooks/test_prediction_pipeline.ipynb b/notebooks/test_prediction_pipeline.ipynb
index e69de29..50a7fab 100644
--- a/notebooks/test_prediction_pipeline.ipynb
+++ b/notebooks/test_prediction_pipeline.ipynb
@@ -0,0 +1,1431 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a dummy dataset with past 5 assessments\n",
+ "import pandas as pd\n",
+ "data_dummy = {\n",
+ " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
+ " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
+ " 'open_items': [10, 12, 11, 9, 13],\n",
+ " 'red_flags': [2, 1, 3, 1, 4],\n",
+ " 'num_employees': [30, 25, 28, 30, 27],\n",
+ " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
+ "}\n",
+ "\n",
+ "df_dummy = pd.DataFrame(data_dummy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " open_items | \n",
+ " red_flags | \n",
+ " num_employees | \n",
+ " assessment_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 | \n",
+ " 2023-01-02 | \n",
+ " 10 | \n",
+ " 2 | \n",
+ " 30 | \n",
+ " weekly | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-08 | \n",
+ " 2023-01-09 | \n",
+ " 12 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " biweekly | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-15 | \n",
+ " 2023-01-16 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 28 | \n",
+ " quarterly | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-22 | \n",
+ " 2023-01-23 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ " weekly | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-29 | \n",
+ " 2023-01-30 | \n",
+ " 13 | \n",
+ " 4 | \n",
+ " 27 | \n",
+ " biweekly | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start_date end_date open_items red_flags num_employees assessment_type\n",
+ "0 2023-01-01 2023-01-02 10 2 30 weekly\n",
+ "1 2023-01-08 2023-01-09 12 1 25 biweekly\n",
+ "2 2023-01-15 2023-01-16 11 3 28 quarterly\n",
+ "3 2023-01-22 2023-01-23 9 1 30 weekly\n",
+ "4 2023-01-29 2023-01-30 13 4 27 biweekly"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_dummy.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " open_items | \n",
+ " red_flags | \n",
+ " num_employees | \n",
+ " assessment_type_biweekly | \n",
+ " assessment_type_quarterly | \n",
+ " assessment_type_weekly | \n",
+ " open_items_weekly_lag_1 | \n",
+ " open_items_biweekly_lag_1 | \n",
+ " open_items_quarterly_lag_1 | \n",
+ " open_items_weekly_ma_3 | \n",
+ " open_items_biweekly_ma_3 | \n",
+ " open_items_quarterly_ma_3 | \n",
+ " time_since_last_event | \n",
+ " percentage_change_open_items | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-15 | \n",
+ " 2023-01-16 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 28 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " -8.333333 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-22 | \n",
+ " 2023-01-23 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 11.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 10.666667 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " -18.181818 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-29 | \n",
+ " 2023-01-30 | \n",
+ " 13 | \n",
+ " 4 | \n",
+ " 27 | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 11.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 44.444444 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start_date end_date open_items red_flags num_employees \\\n",
+ "2 2023-01-15 2023-01-16 11 3 28 \n",
+ "3 2023-01-22 2023-01-23 9 1 30 \n",
+ "4 2023-01-29 2023-01-30 13 4 27 \n",
+ "\n",
+ " assessment_type_biweekly assessment_type_quarterly \\\n",
+ "2 False True \n",
+ "3 False False \n",
+ "4 True False \n",
+ "\n",
+ " assessment_type_weekly open_items_weekly_lag_1 open_items_biweekly_lag_1 \\\n",
+ "2 False 0.0 0.0 \n",
+ "3 True 11.0 0.0 \n",
+ "4 False 0.0 9.0 \n",
+ "\n",
+ " open_items_quarterly_lag_1 open_items_weekly_ma_3 \\\n",
+ "2 12.0 0.000000 \n",
+ "3 0.0 10.666667 \n",
+ "4 0.0 0.000000 \n",
+ "\n",
+ " open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n",
+ "2 0.0 11.0 7.0 \n",
+ "3 0.0 0.0 7.0 \n",
+ "4 11.0 0.0 7.0 \n",
+ "\n",
+ " percentage_change_open_items \n",
+ "2 -8.333333 \n",
+ "3 -18.181818 \n",
+ "4 44.444444 "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Create a dummy dataset with past 5 assessments\n",
+ "data_dummy = {\n",
+ " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
+ " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
+ " 'open_items': [10, 12, 11, 9, 13],\n",
+ " 'red_flags': [2, 1, 3, 1, 4],\n",
+ " 'num_employees': [30, 25, 28, 30, 27],\n",
+ " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
+ "}\n",
+ "\n",
+ "df_dummy = pd.DataFrame(data_dummy)\n",
+ "\n",
+ "# Convert 'assessment_type' to categorical (one-hot encoding)\n",
+ "df_dummy = pd.get_dummies(df_dummy, columns=['assessment_type'], drop_first=False)\n",
+ "\n",
+ "# Create lagged features for each assessment type\n",
+ "df_dummy['open_items_weekly_lag_1'] = df_dummy['open_items'].shift(1) * df_dummy['assessment_type_weekly']\n",
+ "df_dummy['open_items_biweekly_lag_1'] = df_dummy['open_items'].shift(1) * df_dummy['assessment_type_biweekly']\n",
+ "df_dummy['open_items_quarterly_lag_1'] = df_dummy['open_items'].shift(1) * df_dummy['assessment_type_quarterly']\n",
+ "\n",
+ "# Create moving averages for each assessment type\n",
+ "df_dummy['open_items_weekly_ma_3'] = df_dummy['open_items'].rolling(window=3).mean() * df_dummy['assessment_type_weekly']\n",
+ "df_dummy['open_items_biweekly_ma_3'] = df_dummy['open_items'].rolling(window=3).mean() * df_dummy['assessment_type_biweekly']\n",
+ "df_dummy['open_items_quarterly_ma_3'] = df_dummy['open_items'].rolling(window=3).mean() * df_dummy['assessment_type_quarterly']\n",
+ "\n",
+ "# Add time since last event (days between assessments)\n",
+ "df_dummy['start_date'] = pd.to_datetime(df_dummy['start_date'])\n",
+ "df_dummy['time_since_last_event'] = df_dummy['start_date'].diff().dt.days\n",
+ "\n",
+ "# Add percentage change in open items\n",
+ "df_dummy['percentage_change_open_items'] = df_dummy['open_items'].pct_change() * 100\n",
+ "\n",
+ "# Remove any rows with NaN values created by lagging or rolling window calculations\n",
+ "df_dummy.dropna(inplace=True)\n",
+ "\n",
+ "# Display the final DataFrame with all time-based features\n",
+ "df_dummy.head() \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Create a dummy dataset with past 5 assessments\n",
+ "data_dummy = {\n",
+ " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
+ " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
+ " 'open_items': [10, 12, 11, 9, 13],\n",
+ " 'red_flags': [2, 1, 3, 1, 4],\n",
+ " 'num_employees': [30, 25, 28, 30, 27],\n",
+ " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
+ "}\n",
+ "\n",
+ "df_dummy = pd.DataFrame(data_dummy)\n",
+ "\n",
+ "# Save the DataFrame as a CSV file\n",
+ "df_dummy.to_csv('dummy_assessment_data.csv', index=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " open_items | \n",
+ " red_flags | \n",
+ " num_employees | \n",
+ " assessment_type_biweekly | \n",
+ " assessment_type_quarterly | \n",
+ " assessment_type_weekly | \n",
+ " open_items_assessment_type_weekly_lag_1 | \n",
+ " open_items_assessment_type_biweekly_lag_1 | \n",
+ " open_items_assessment_type_quarterly_lag_1 | \n",
+ " open_items_weekly_ma_3 | \n",
+ " open_items_biweekly_ma_3 | \n",
+ " open_items_quarterly_ma_3 | \n",
+ " time_since_last_event | \n",
+ " percentage_change_open_items | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [start_date, end_date, open_items, red_flags, num_employees, assessment_type_biweekly, assessment_type_quarterly, assessment_type_weekly, open_items_assessment_type_weekly_lag_1, open_items_assessment_type_biweekly_lag_1, open_items_assessment_type_quarterly_lag_1, open_items_weekly_ma_3, open_items_biweekly_ma_3, open_items_quarterly_ma_3, time_since_last_event, percentage_change_open_items]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Create a dummy dataset with past 5 assessments\n",
+ "data_dummy = {\n",
+ " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
+ " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
+ " 'open_items': [10, 12, 11, 9, 13],\n",
+ " 'red_flags': [2, 1, 3, 1, 4],\n",
+ " 'num_employees': [30, 25, 28, 30, 27],\n",
+ " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
+ "}\n",
+ "\n",
+ "df_dummy = pd.DataFrame(data_dummy)\n",
+ "\n",
+ "# Convert 'assessment_type' to categorical (one-hot encoding)\n",
+ "df_dummy = pd.get_dummies(df_dummy, columns=['assessment_type'], drop_first=False)\n",
+ "\n",
+ "# Define a function to create lagged features based on assessment type\n",
+ "def create_lagged_features(df, col, assessment_col):\n",
+ " lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
+ " df[lagged_col] = df[df[assessment_col] == 1][col].shift(1)\n",
+ " return df\n",
+ "\n",
+ "# Create lagged features for each assessment type\n",
+ "df_dummy = create_lagged_features(df_dummy, 'open_items', 'assessment_type_weekly')\n",
+ "df_dummy = create_lagged_features(df_dummy, 'open_items', 'assessment_type_biweekly')\n",
+ "df_dummy = create_lagged_features(df_dummy, 'open_items', 'assessment_type_quarterly')\n",
+ "\n",
+ "# Fill NaNs with 0 or forward-fill them depending on how you want to handle missing lags\n",
+ "df_dummy.fillna(0, inplace=True)\n",
+ "\n",
+ "# Create moving averages for each assessment type\n",
+ "df_dummy['open_items_weekly_ma_3'] = df_dummy[df_dummy['assessment_type_weekly'] == 1]['open_items'].rolling(window=3).mean()\n",
+ "df_dummy['open_items_biweekly_ma_3'] = df_dummy[df_dummy['assessment_type_biweekly'] == 1]['open_items'].rolling(window=3).mean()\n",
+ "df_dummy['open_items_quarterly_ma_3'] = df_dummy[df_dummy['assessment_type_quarterly'] == 1]['open_items'].rolling(window=3).mean()\n",
+ "\n",
+ "# Add time since last event (days between assessments)\n",
+ "df_dummy['start_date'] = pd.to_datetime(df_dummy['start_date'])\n",
+ "df_dummy['time_since_last_event'] = df_dummy['start_date'].diff().dt.days\n",
+ "\n",
+ "# Add percentage change in open items\n",
+ "df_dummy['percentage_change_open_items'] = df_dummy['open_items'].pct_change() * 100\n",
+ "\n",
+ "# Remove any rows with NaN values created by lagging or rolling window calculations\n",
+ "df_dummy.dropna(inplace=True)\n",
+ "\n",
+ "# Display the final DataFrame with all time-based features\n",
+ "df_dummy.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " open_items | \n",
+ " red_flags | \n",
+ " num_employees | \n",
+ " assessment_type_biweekly | \n",
+ " assessment_type_quarterly | \n",
+ " assessment_type_weekly | \n",
+ " open_items_assessment_type_weekly_lag_1 | \n",
+ " open_items_assessment_type_biweekly_lag_1 | \n",
+ " open_items_assessment_type_quarterly_lag_1 | \n",
+ " open_items_weekly_ma_3 | \n",
+ " open_items_biweekly_ma_3 | \n",
+ " open_items_quarterly_ma_3 | \n",
+ " time_since_last_event | \n",
+ " percentage_change_open_items | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 | \n",
+ " 2023-01-02 | \n",
+ " 10 | \n",
+ " 2 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 10.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-08 | \n",
+ " 2023-01-09 | \n",
+ " 12 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 10.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 10.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 20.000000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-15 | \n",
+ " 2023-01-16 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 28 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 10.0 | \n",
+ " 12.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " -8.333333 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-22 | \n",
+ " 2023-01-23 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " 9.0 | \n",
+ " 12.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " -18.181818 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-29 | \n",
+ " 2023-01-30 | \n",
+ " 13 | \n",
+ " 4 | \n",
+ " 27 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 13.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " 44.444444 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start_date end_date open_items red_flags num_employees \\\n",
+ "0 2023-01-01 2023-01-02 10 2 30 \n",
+ "1 2023-01-08 2023-01-09 12 1 25 \n",
+ "2 2023-01-15 2023-01-16 11 3 28 \n",
+ "3 2023-01-22 2023-01-23 9 1 30 \n",
+ "4 2023-01-29 2023-01-30 13 4 27 \n",
+ "\n",
+ " assessment_type_biweekly assessment_type_quarterly \\\n",
+ "0 0 0 \n",
+ "1 1 0 \n",
+ "2 0 1 \n",
+ "3 0 0 \n",
+ "4 1 0 \n",
+ "\n",
+ " assessment_type_weekly open_items_assessment_type_weekly_lag_1 \\\n",
+ "0 1 0.0 \n",
+ "1 0 10.0 \n",
+ "2 0 0.0 \n",
+ "3 1 0.0 \n",
+ "4 0 9.0 \n",
+ "\n",
+ " open_items_assessment_type_biweekly_lag_1 \\\n",
+ "0 0.0 \n",
+ "1 0.0 \n",
+ "2 12.0 \n",
+ "3 0.0 \n",
+ "4 0.0 \n",
+ "\n",
+ " open_items_assessment_type_quarterly_lag_1 open_items_weekly_ma_3 \\\n",
+ "0 0.0 10.0 \n",
+ "1 0.0 10.0 \n",
+ "2 0.0 10.0 \n",
+ "3 11.0 9.0 \n",
+ "4 0.0 9.0 \n",
+ "\n",
+ " open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 12.0 0.0 7.0 \n",
+ "2 12.0 11.0 7.0 \n",
+ "3 12.0 11.0 7.0 \n",
+ "4 13.0 11.0 7.0 \n",
+ "\n",
+ " percentage_change_open_items \n",
+ "0 0.000000 \n",
+ "1 20.000000 \n",
+ "2 -8.333333 \n",
+ "3 -18.181818 \n",
+ "4 44.444444 "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "\n",
+ "class DataPreprocessor:\n",
+ " def __init__(self, input_path, output_dir):\n",
+ " self.input_path = input_path\n",
+ " self.output_dir = output_dir\n",
+ " self.df = None\n",
+ "\n",
+ " def load_data(self):\n",
+ " self.df = pd.read_csv(self.input_path)\n",
+ "\n",
+ " def preprocess(self):\n",
+ " # Convert 'assessment_type' to categorical (one-hot encoding)\n",
+ " self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)\n",
+ "\n",
+ " # Convert boolean columns to 1s and 0s\n",
+ " self.df['assessment_type_weekly'] = self.df['assessment_type_weekly'].astype(int)\n",
+ " self.df['assessment_type_biweekly'] = self.df['assessment_type_biweekly'].astype(int)\n",
+ " self.df['assessment_type_quarterly'] = self.df['assessment_type_quarterly'].astype(int)\n",
+ "\n",
+ " # Function to create lagged features based on assessment type\n",
+ " def create_lagged_features(df, col, assessment_col):\n",
+ " lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
+ " df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)\n",
+ " return df\n",
+ "\n",
+ " # Create lagged features for each assessment type\n",
+ " self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_weekly')\n",
+ " self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_biweekly')\n",
+ " self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_quarterly')\n",
+ "\n",
+ " # Fill NaNs with 0 instead of dropping rows\n",
+ " self.df.fillna(0, inplace=True)\n",
+ "\n",
+ " # Create moving averages for each assessment type\n",
+ " self.df['open_items_weekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ " self.df['open_items_biweekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ " self.df['open_items_quarterly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ "\n",
+ " # Add time since last event (days between assessments)\n",
+ " self.df['start_date'] = pd.to_datetime(self.df['start_date'])\n",
+ " self.df['time_since_last_event'] = self.df['start_date'].diff().dt.days.fillna(0)\n",
+ "\n",
+ " # Add percentage change in open items\n",
+ " self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100\n",
+ "\n",
+ " def save_data(self):\n",
+ " output_path = os.path.join(self.output_dir, 'preprocessed_data.csv')\n",
+ " self.df.to_csv(output_path, index=False)\n",
+ " return output_path\n",
+ "\n",
+ " def run(self):\n",
+ " self.load_data()\n",
+ " self.preprocess()\n",
+ " return self.save_data()\n",
+ "\n",
+ "\n",
+ "preprocessor = DataPreprocessor('path/to/input.csv', 'path/to/output/directory')\n",
+ "output_file = preprocessor.run()\n",
+ "# print(f\"Preprocessed data saved to: {output_file}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " open_items | \n",
+ " red_flags | \n",
+ " num_employees | \n",
+ " assessment_type_biweekly | \n",
+ " assessment_type_quarterly | \n",
+ " assessment_type_weekly | \n",
+ " open_items_assessment_type_weekly_lag_1 | \n",
+ " open_items_assessment_type_biweekly_lag_1 | \n",
+ " open_items_assessment_type_quarterly_lag_1 | \n",
+ " open_items_weekly_ma_3 | \n",
+ " open_items_biweekly_ma_3 | \n",
+ " open_items_quarterly_ma_3 | \n",
+ " time_since_last_event | \n",
+ " percentage_change_open_items | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-15 | \n",
+ " 2023-01-16 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 28 | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 12.0 | \n",
+ " 0.0 | \n",
+ " 10.0 | \n",
+ " 12.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " -8.333333 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-22 | \n",
+ " 2023-01-23 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " 9.0 | \n",
+ " 12.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " -18.181818 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-29 | \n",
+ " 2023-01-30 | \n",
+ " 13 | \n",
+ " 4 | \n",
+ " 27 | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 13.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " 44.444444 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start_date end_date open_items red_flags num_employees \\\n",
+ "2 2023-01-15 2023-01-16 11 3 28 \n",
+ "3 2023-01-22 2023-01-23 9 1 30 \n",
+ "4 2023-01-29 2023-01-30 13 4 27 \n",
+ "\n",
+ " assessment_type_biweekly assessment_type_quarterly \\\n",
+ "2 False True \n",
+ "3 False False \n",
+ "4 True False \n",
+ "\n",
+ " assessment_type_weekly open_items_assessment_type_weekly_lag_1 \\\n",
+ "2 False 0.0 \n",
+ "3 True 0.0 \n",
+ "4 False 9.0 \n",
+ "\n",
+ " open_items_assessment_type_biweekly_lag_1 \\\n",
+ "2 12.0 \n",
+ "3 0.0 \n",
+ "4 0.0 \n",
+ "\n",
+ " open_items_assessment_type_quarterly_lag_1 open_items_weekly_ma_3 \\\n",
+ "2 0.0 10.0 \n",
+ "3 11.0 9.0 \n",
+ "4 0.0 9.0 \n",
+ "\n",
+ " open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n",
+ "2 12.0 11.0 7.0 \n",
+ "3 12.0 11.0 7.0 \n",
+ "4 13.0 11.0 7.0 \n",
+ "\n",
+ " percentage_change_open_items \n",
+ "2 -8.333333 \n",
+ "3 -18.181818 \n",
+ "4 44.444444 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " open_items | \n",
+ " red_flags | \n",
+ " num_employees | \n",
+ " assessment_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 | \n",
+ " 2023-01-02 | \n",
+ " 10 | \n",
+ " 2 | \n",
+ " 30 | \n",
+ " weekly | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-08 | \n",
+ " 2023-01-09 | \n",
+ " 12 | \n",
+ " 1 | \n",
+ " 25 | \n",
+ " biweekly | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-15 | \n",
+ " 2023-01-16 | \n",
+ " 11 | \n",
+ " 3 | \n",
+ " 28 | \n",
+ " quarterly | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-22 | \n",
+ " 2023-01-23 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ " weekly | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-29 | \n",
+ " 2023-01-30 | \n",
+ " 13 | \n",
+ " 4 | \n",
+ " 27 | \n",
+ " biweekly | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start_date end_date open_items red_flags num_employees assessment_type\n",
+ "0 2023-01-01 2023-01-02 10 2 30 weekly\n",
+ "1 2023-01-08 2023-01-09 12 1 25 biweekly\n",
+ "2 2023-01-15 2023-01-16 11 3 28 quarterly\n",
+ "3 2023-01-22 2023-01-23 9 1 30 weekly\n",
+ "4 2023-01-29 2023-01-30 13 4 27 biweekly"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_dummy.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Create a dummy dataset with past 5 assessments\n",
+ "data_dummy = {\n",
+ " 'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
+ " 'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
+ " 'open_items': [10, 12, 11, 9, 13],\n",
+ " 'red_flags': [2, 1, 3, 1, 4],\n",
+ " 'num_employees': [30, 25, 28, 30, 27],\n",
+ " 'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
+ "}\n",
+ "\n",
+ "df = pd.DataFrame(data_dummy)\n",
+ "\n",
+ "# Convert 'assessment_type' to categorical (one-hot encoding)\n",
+ "df = pd.get_dummies(df, columns=['assessment_type'], drop_first=False)\n",
+ "\n",
+ "# Convert boolean columns to 1s and 0s\n",
+ "df['assessment_type_weekly'] = df['assessment_type_weekly'].astype(int)\n",
+ "df['assessment_type_biweekly'] = df['assessment_type_biweekly'].astype(int)\n",
+ "df['assessment_type_quarterly'] = df['assessment_type_quarterly'].astype(int)\n",
+ "\n",
+ "# Function to create lagged features based on assessment type\n",
+ "def create_lagged_features(df, col, assessment_col):\n",
+ " lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
+ " df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)\n",
+ " return df\n",
+ "\n",
+ "# Create lagged features for each assessment type\n",
+ "df = create_lagged_features(df, 'open_items', 'assessment_type_weekly')\n",
+ "df = create_lagged_features(df, 'open_items', 'assessment_type_biweekly')\n",
+ "df = create_lagged_features(df, 'open_items', 'assessment_type_quarterly')\n",
+ "\n",
+ "# Fill NaNs with 0 instead of dropping rows\n",
+ "df.fillna(0, inplace=True)\n",
+ "\n",
+ "# Create moving averages for each assessment type\n",
+ "df['open_items_weekly_ma_3'] = df['open_items'].where(df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ "df['open_items_biweekly_ma_3'] = df['open_items'].where(df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ "df['open_items_quarterly_ma_3'] = df['open_items'].where(df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ "\n",
+ "# Add time since last event (days between assessments)\n",
+ "df['start_date'] = pd.to_datetime(df['start_date'])\n",
+ "df['time_since_last_event'] = df['start_date'].diff().dt.days.fillna(0)\n",
+ "\n",
+ "# Add percentage change in open items\n",
+ "df['percentage_change_open_items'] = df['open_items'].pct_change().fillna(0) * 100\n",
+ "\n",
+ "# Display the final DataFrame with all time-based features\n",
+ "df.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "\n",
+ "class DataPreprocessor:\n",
+ " def __init__(self, input_path, company_id):\n",
+ " self.input_path = input_path\n",
+ " self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id)\n",
+ " self.company_id = company_id\n",
+ " self.df = None\n",
+ "\n",
+ " def load_data(self):\n",
+ " self.df = pd.read_csv(self.input_path)\n",
+ "\n",
+ " def preprocess(self):\n",
+ " # Convert 'start_date' and 'end_date' to datetime\n",
+ " self.df['start_date'] = pd.to_datetime(self.df['start_date'])\n",
+ " self.df['end_date'] = pd.to_datetime(self.df['end_date'])\n",
+ "\n",
+ " # Add duration (in days) by subtracting start_date from end_date\n",
+ " self.df['duration'] = (self.df['end_date'] - self.df['start_date']).dt.days\n",
+ "\n",
+ " # Drop the 'start_date' and 'end_date' columns as they are not needed for training\n",
+ " self.df.drop(columns=['start_date', 'end_date'], inplace=True)\n",
+ "\n",
+ " # Convert 'assessment_type' to categorical (one-hot encoding)\n",
+ " self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)\n",
+ "\n",
+ " # Convert boolean columns to 1s and 0s\n",
+ " self.df['assessment_type_weekly'] = self.df['assessment_type_weekly'].astype(int)\n",
+ " self.df['assessment_type_biweekly'] = self.df['assessment_type_biweekly'].astype(int)\n",
+ " self.df['assessment_type_quarterly'] = self.df['assessment_type_quarterly'].astype(int)\n",
+ "\n",
+ " # Function to create lagged features based on assessment type\n",
+ " def create_lagged_features(df, col, assessment_col):\n",
+ " lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
+ " df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)\n",
+ " return df\n",
+ "\n",
+ " # Create lagged features for each assessment type\n",
+ " self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_weekly')\n",
+ " self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_biweekly')\n",
+ " self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_quarterly')\n",
+ "\n",
+ " # Fill NaNs with 0 instead of dropping rows\n",
+ " self.df.fillna(0, inplace=True)\n",
+ "\n",
+ " # Create moving averages for each assessment type\n",
+ " self.df['open_items_weekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ " self.df['open_items_biweekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ " self.df['open_items_quarterly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)\n",
+ "\n",
+ " # Add percentage change in open items\n",
+ " self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100\n",
+ "\n",
+ " def save_data(self):\n",
+ " os.makedirs(self.output_dir, exist_ok=True) # Ensure output directory exists\n",
+ " output_path = os.path.join(self.output_dir, 'output.csv')\n",
+ " self.df.to_csv(output_path, index=False)\n",
+ " return output_path\n",
+ "\n",
+ " def run(self):\n",
+ " self.load_data()\n",
+ " self.preprocess()\n",
+ " return self.save_data()\n",
+ "\n",
+ "# Example usage:\n",
+ "# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')\n",
+ "# processed_data_path = preprocessor.run()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dp = DataPreprocessor(\n",
+ " input_path=\"/root/ds_erp_ai/data/raw/dummy_assessment_data.csv\",\n",
+ " company_id=\"testid\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "res = dp.run()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'data/processed/assessment_prediction/testid/output.csv'"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model saved to models/assessment_prediction/testid/testid_model.pkl\n",
+ "Latest assessment data saved to models/assessment_prediction/testid/testid_latest_data.csv\n",
+ "Model Evaluation Metrics:\n",
+ "Mean Absolute Error (MAE): 1.3099999999999996\n",
+ "Mean Squared Error (MSE): 2.3089999999999997\n",
+ "R-squared (R²): nan\n",
+ "The model was saved at: models/assessment_prediction/testid/testid_model.pkl\n",
+ "The latest data was saved at: models/assessment_prediction/testid/testid_latest_data.csv\n",
+ "Evaluation Results: {'mae': 1.3099999999999996, 'mse': 2.3089999999999997, 'r2': nan}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/root/ds_erp_ai/erp/lib/python3.10/site-packages/sklearn/metrics/_regression.py:1211: UndefinedMetricWarning: R^2 score is not well-defined with less than two samples.\n",
+ " warnings.warn(msg, UndefinedMetricWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.multioutput import MultiOutputRegressor\n",
+ "import joblib\n",
+ "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
+ "\n",
+ "class ModelTrainer:\n",
+ " def __init__(self, preprocessed_data_path, company_id, model):\n",
+ " self.preprocessed_data_path = preprocessed_data_path\n",
+ " self.output_dir = os.path.join('models', 'assessment_prediction', company_id)\n",
+ " self.company_id = company_id\n",
+ " self.df = None\n",
+ " self.model = model # Model passed as an argument\n",
+ " self.X_test = None\n",
+ " self.y_test = None\n",
+ "\n",
+ " def load_data(self):\n",
+ " self.df = pd.read_csv(self.preprocessed_data_path)\n",
+ "\n",
+ " def train_model(self):\n",
+ " # Split data into features (X) and target variables (y)\n",
+ " X = self.df.drop(columns=['open_items', 'red_flags'])\n",
+ " y = self.df[['open_items', 'red_flags']] # Multi-target for open items and red flags\n",
+ "\n",
+ " # Split into training and test sets with 10% as test size\n",
+ " X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42)\n",
+ "\n",
+ " # Train the model\n",
+ " self.model.fit(X_train, y_train)\n",
+ "\n",
+ " # Save the trained model\n",
+ " os.makedirs(self.output_dir, exist_ok=True)\n",
+ " model_path = os.path.join(self.output_dir, f'{self.company_id}_model.pkl')\n",
+ " joblib.dump(self.model, model_path)\n",
+ " print(f\"Model saved to {model_path}\")\n",
+ "\n",
+ " # Save the latest row (last assessment data) for inference\n",
+ " latest_data_path = os.path.join(self.output_dir, f'{self.company_id}_latest_data.csv')\n",
+ " self.df.tail(1).to_csv(latest_data_path, index=False)\n",
+ " print(f\"Latest assessment data saved to {latest_data_path}\")\n",
+ "\n",
+ " # Return the model path and latest data path\n",
+ " return model_path, latest_data_path\n",
+ "\n",
+ " def evaluate_model(self):\n",
+ " # Predict using the test data\n",
+ " y_pred = self.model.predict(self.X_test)\n",
+ "\n",
+ " # Calculate evaluation metrics\n",
+ " mae = mean_absolute_error(self.y_test, y_pred)\n",
+ " mse = mean_squared_error(self.y_test, y_pred)\n",
+ " r2 = r2_score(self.y_test, y_pred)\n",
+ "\n",
+ " print(\"Model Evaluation Metrics:\")\n",
+ " print(f\"Mean Absolute Error (MAE): {mae}\")\n",
+ " print(f\"Mean Squared Error (MSE): {mse}\")\n",
+ " print(f\"R-squared (R²): {r2}\")\n",
+ "\n",
+ " # Return evaluation results\n",
+ " return {'mae': mae, 'mse': mse, 'r2': r2}\n",
+ "\n",
+ " def run(self):\n",
+ " # Load data and train the model\n",
+ " self.load_data()\n",
+ " model_path, latest_data_path = self.train_model()\n",
+ "\n",
+ " # Evaluate the model immediately after training\n",
+ " evaluation_results = self.evaluate_model()\n",
+ "\n",
+ " return model_path, latest_data_path, evaluation_results\n",
+ "\n",
+ "# Example usage\n",
+ "model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))\n",
+ "trainer = ModelTrainer(preprocessed_data_path=res, company_id='testid', model=model)\n",
+ "model_path, latest_data_path, evaluation_results = trainer.run()\n",
+ "print(f\"The model was saved at: {model_path}\")\n",
+ "print(f\"The latest data was saved at: {latest_data_path}\")\n",
+ "print(f\"Evaluation Results: {evaluation_results}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'data/processed/assessment_prediction/testid/output.csv'"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'models/assessment_prediction/testid/testid_model.pkl'"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "e"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model loaded from models/assessment_prediction/testid/testid_model.pkl\n",
+ "Latest data loaded from models/assessment_prediction/testid/testid_latest_data.csv\n",
+ "\n",
+ "Forecasting assessment 1/5\n",
+ "\n",
+ "Forecasting assessment 2/5\n",
+ "\n",
+ "Forecasting assessment 3/5\n",
+ "\n",
+ "Forecasting assessment 4/5\n",
+ "\n",
+ "Forecasting assessment 5/5\n",
+ "[{'forecast_step': 1, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 2, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 3, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 4, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 5, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import joblib\n",
+ "import os\n",
+ "\n",
+ "class AssessmentInference:\n",
+ " def __init__(self, company_id, num_assessments, model_dir='models'):\n",
+ " self.company_id = company_id\n",
+ " self.num_assessments = num_assessments\n",
+ " self.model_dir = model_dir\n",
+ " self.model = None\n",
+ " self.latest_data = None\n",
+ "\n",
+ " def load_model(self):\n",
+ " # Load the trained model\n",
+ " model_path = os.path.join(self.model_dir, 'assessment_prediction', self.company_id, f'{self.company_id}_model.pkl')\n",
+ " self.model = joblib.load(model_path)\n",
+ " print(f\"Model loaded from {model_path}\")\n",
+ "\n",
+ " def load_latest_data(self):\n",
+ " # Load the latest assessment data\n",
+ " latest_data_path = os.path.join(self.model_dir, 'assessment_prediction', self.company_id, f'{self.company_id}_latest_data.csv')\n",
+ " self.latest_data = pd.read_csv(latest_data_path)\n",
+ " print(f\"Latest data loaded from {latest_data_path}\")\n",
+ "\n",
+ " def predict_next_assessment(self, current_data, assessment_type):\n",
+ " # Update assessment type (weekly, biweekly, quarterly) in the data for prediction\n",
+ " current_data['assessment_type_weekly'] = 1 if assessment_type == 'weekly' else 0\n",
+ " current_data['assessment_type_biweekly'] = 1 if assessment_type == 'biweekly' else 0\n",
+ " current_data['assessment_type_quarterly'] = 1 if assessment_type == 'quarterly' else 0\n",
+ "\n",
+ " # Exclude target variables (open_items, red_flags) from the feature set\n",
+ " features = current_data.drop(columns=['open_items', 'red_flags'])\n",
+ "\n",
+ " # Predict the next open items and red flags\n",
+ " prediction = self.model.predict(features)\n",
+ " open_items_pred, red_flags_pred = prediction[0]\n",
+ "\n",
+ " # Ensure the predictions are integers by rounding\n",
+ " open_items_pred = int(round(open_items_pred))\n",
+ " red_flags_pred = int(round(red_flags_pred))\n",
+ "\n",
+ " return {\n",
+ " 'assessment_type': assessment_type,\n",
+ " 'open_items': open_items_pred,\n",
+ " 'red_flags': red_flags_pred\n",
+ " }\n",
+ "\n",
+ " def predict_next_assessments(self):\n",
+ " predictions = []\n",
+ " current_data = self.latest_data.copy()\n",
+ "\n",
+ " # Iteratively forecast the next assessments\n",
+ " for i in range(self.num_assessments):\n",
+ " print(f\"\\nForecasting assessment {i + 1}/{self.num_assessments}\")\n",
+ "\n",
+ " # Predict for weekly, biweekly, and quarterly for the same forecast step\n",
+ " weekly_prediction = self.predict_next_assessment(current_data, 'weekly')\n",
+ " biweekly_prediction = self.predict_next_assessment(current_data, 'biweekly')\n",
+ " quarterly_prediction = self.predict_next_assessment(current_data, 'quarterly')\n",
+ "\n",
+ " # Append predictions for all types in one forecast step\n",
+ " predictions.append({\n",
+ " 'forecast_step': i + 1,\n",
+ " 'weekly': weekly_prediction,\n",
+ " 'biweekly': biweekly_prediction,\n",
+ " 'quarterly': quarterly_prediction\n",
+ " })\n",
+ "\n",
+ " # Update the current data with the weekly prediction (or any of the predictions) for the next step\n",
+ " current_data['open_items'] = weekly_prediction['open_items']\n",
+ " current_data['red_flags'] = weekly_prediction['red_flags']\n",
+ "\n",
+ " return predictions\n",
+ "\n",
+ " def run(self):\n",
+ " self.load_model()\n",
+ " self.load_latest_data()\n",
+ " predictions = self.predict_next_assessments()\n",
+ " return predictions\n",
+ "\n",
+ "\n",
+ "# Example usage\n",
+ "inference = AssessmentInference(company_id='testid', num_assessments=5)\n",
+ "predictions = inference.run()\n",
+ "print(predictions)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "erp",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/scripts/run_assessment_prediction_trainer.py b/scripts/run_assessment_prediction_trainer.py
index 2618b86..44d7b89 100644
--- a/scripts/run_assessment_prediction_trainer.py
+++ b/scripts/run_assessment_prediction_trainer.py
@@ -26,7 +26,7 @@ class CompanyModelPipeline:
logger.info(f"Starting preprocessing for company {company_id}.")
- # Step 1: Preprocess the data
+ # Step 1: Preprocess the data
preprocessor = DataPreprocessor(input_path=input_path, company_id=company_id)
processed_data_path = preprocessor.run()
logger.info(f"Data preprocessing completed for company {company_id}. Processed data saved to {processed_data_path}.")
diff --git a/src/pipeline/data_preprocessor.py b/src/pipeline/data_preprocessor.py
index 8bfa129..adccf94 100644
--- a/src/pipeline/data_preprocessor.py
+++ b/src/pipeline/data_preprocessor.py
@@ -1,5 +1,13 @@
import pandas as pd
import os
+import logging
+from logging.handlers import RotatingFileHandler
+
+os.makedirs('/root/ds_erp_ai/logs', exist_ok=True)  # RotatingFileHandler opens its file on construction
+handler = RotatingFileHandler('/root/ds_erp_ai/logs/prediction_pipeline.log', maxBytes=100000, backupCount=3)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+logger.addHandler(handler)
class DataPreprocessor:
def __init__(self, input_path, company_id):
diff --git a/src/pipeline/inference.py b/src/pipeline/inference.py
index e69de29..141feae 100644
--- a/src/pipeline/inference.py
+++ b/src/pipeline/inference.py
@@ -0,0 +1,134 @@
+"""Forecast upcoming assessment outcomes from a trained per-company model.
+
+Example:
+    inference = AssessmentInference(company_id='testid', num_assessments=5)
+    predictions = inference.run()
+    print(predictions)
+"""
+import logging
+import os
+
+import joblib
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+# Assessment types that have a one-hot ``assessment_type_<name>`` feature column.
+ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')
+# Columns the model predicts; they are removed before building the feature matrix.
+TARGET_COLUMNS = ['open_items', 'red_flags']
+
+
+class AssessmentInference:
+    """Forecast open items and red flags from a saved model and the latest assessment data."""
+
+    def __init__(self, company_id, num_assessments, model_dir='models'):
+        """Store the forecast settings; nothing is read from disk here.
+
+        Args:
+            company_id: Identifier used to locate the company's model artefacts.
+            num_assessments: Number of forecast steps to produce.
+            model_dir: Root directory that contains ``assessment_prediction/``.
+        """
+        self.company_id = company_id
+        self.num_assessments = num_assessments
+        self.model_dir = model_dir
+        self.model = None
+        self.latest_data = None
+
+    def _artifact_path(self, suffix):
+        """Return the path of this company's artefact file ending in ``suffix``."""
+        return os.path.join(
+            self.model_dir, 'assessment_prediction', self.company_id,
+            f'{self.company_id}_{suffix}',
+        )
+
+    def load_model(self):
+        """Load the trained model into ``self.model``."""
+        model_path = self._artifact_path('model.pkl')
+        # joblib.load unpickles arbitrary objects: only point model_dir at trusted files.
+        self.model = joblib.load(model_path)
+        logger.info("Model loaded from %s", model_path)
+
+    def load_latest_data(self):
+        """Load the latest assessment data into ``self.latest_data``."""
+        latest_data_path = self._artifact_path('latest_data.csv')
+        self.latest_data = pd.read_csv(latest_data_path)
+        logger.info("Latest data loaded from %s", latest_data_path)
+
+    def predict_next_assessment(self, current_data, assessment_type):
+        """Predict open items and red flags for one assessment type.
+
+        Args:
+            current_data: DataFrame holding the feature columns plus the target
+                columns. Only the prediction for its first row is used, and the
+                frame itself is left unmodified.
+            assessment_type: One of ``ASSESSMENT_TYPES``.
+
+        Returns:
+            dict with keys 'assessment_type', 'open_items' and 'red_flags';
+            both predictions are rounded to int.
+
+        Raises:
+            ValueError: If ``assessment_type`` is not a known type.
+        """
+        if assessment_type not in ASSESSMENT_TYPES:
+            raise ValueError(
+                f"Unknown assessment_type {assessment_type!r}; expected one of {ASSESSMENT_TYPES}"
+            )
+
+        # drop() returns a new frame, so setting the flags below leaves the caller's data untouched.
+        features = current_data.drop(columns=TARGET_COLUMNS)
+        for known_type in ASSESSMENT_TYPES:
+            features[f'assessment_type_{known_type}'] = int(known_type == assessment_type)
+
+        open_items_pred, red_flags_pred = self.model.predict(features)[0]
+
+        return {
+            'assessment_type': assessment_type,
+            'open_items': int(round(open_items_pred)),
+            'red_flags': int(round(red_flags_pred)),
+        }
+
+    def predict_next_assessments(self):
+        """Forecast every assessment type for each of the next ``num_assessments`` steps.
+
+        Returns:
+            list of dicts, one per step, with keys 'forecast_step' (1-based),
+            'weekly', 'biweekly' and 'quarterly'.
+
+        Raises:
+            RuntimeError: If the latest data has not been loaded.
+            ValueError: If the latest data contains no rows.
+        """
+        if self.latest_data is None:
+            raise RuntimeError("Latest data is not loaded; call load_latest_data() first.")
+        if self.latest_data.empty:
+            raise ValueError("Latest data is empty; there is nothing to forecast from.")
+
+        predictions = []
+        current_data = self.latest_data.copy()
+
+        for step in range(1, self.num_assessments + 1):
+            logger.info("Forecasting assessment %s/%s", step, self.num_assessments)
+
+            # Predict every assessment type from the same inputs for this step.
+            forecast = {'forecast_step': step}
+            for assessment_type in ASSESSMENT_TYPES:
+                forecast[assessment_type] = self.predict_next_assessment(current_data, assessment_type)
+            predictions.append(forecast)
+
+            # NOTE(review): the target columns are dropped before predicting and no lag,
+            # moving-average or percentage-change feature is recomputed, so this write-back
+            # does not change the next step's inputs and every step predicts the same values.
+            # TODO: derive the next step's features from the prediction to forecast iteratively.
+            current_data['open_items'] = forecast['weekly']['open_items']
+            current_data['red_flags'] = forecast['weekly']['red_flags']
+
+        return predictions
+
+    def run(self):
+        """Load the model and latest data, then return the forecasts."""
+        self.load_model()
+        self.load_latest_data()
+        return self.predict_next_assessments()
diff --git a/test.py b/test.py
index 1e47022..11fbc78 100644
--- a/test.py
+++ b/test.py
@@ -1,7 +1,20 @@
# Example usage
-from scripts.run_assessment_prediction_trainer import CompanyModelPipeline
-company_ids = ['company_123', 'company_456', 'company_789']
-input_base_path = '/root/ds_erp_ai/data/raw/dummy_assessment_data.csv' # The base path where the raw data for each company is stored
+'''from scripts.run_assessment_prediction_trainer import CompanyModelPipeline
+company_ids = ['testid']
+input_base_path = '/root/ds_erp_ai/data/raw/erp_assessment_prediction' # The base path where the raw data for each company is stored
pipeline = CompanyModelPipeline(company_ids=company_ids, input_base_path=input_base_path)
-pipeline.run_pipeline()
+pipeline.run_pipeline()'''
+
+from src.pipeline.inference import AssessmentInference
+
+# Forecast the next two assessments for the test company from its saved model artefacts.
+inference = AssessmentInference(
+    company_id="testid",
+    num_assessments=2,
+)
+
+# run() loads the model and latest data, then returns one prediction dict per forecast step.
+result = inference.run()
+
+print(result)