Files
erp-ai-latest/notebooks/test_prediction_pipeline.ipynb
T

1432 lines
56 KiB
Plaintext
Raw Normal View History

2024-09-12 21:36:02 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Dummy dataset: the five most recent assessments, spaced one week apart\n",
"import pandas as pd\n",
"\n",
"data_dummy = dict(\n",
"    start_date=pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
"    end_date=pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
"    open_items=[10, 12, 11, 9, 13],\n",
"    red_flags=[2, 1, 3, 1, 4],\n",
"    num_employees=[30, 25, 28, 30, 27],\n",
"    assessment_type=['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly'],\n",
")\n",
"\n",
"df_dummy = pd.DataFrame(data_dummy)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>start_date</th>\n",
" <th>end_date</th>\n",
" <th>open_items</th>\n",
" <th>red_flags</th>\n",
" <th>num_employees</th>\n",
" <th>assessment_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-01-01</td>\n",
" <td>2023-01-02</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" <td>30</td>\n",
" <td>weekly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-01-08</td>\n",
" <td>2023-01-09</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>25</td>\n",
" <td>biweekly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-01-15</td>\n",
" <td>2023-01-16</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" <td>quarterly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-01-22</td>\n",
" <td>2023-01-23</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>30</td>\n",
" <td>weekly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-01-29</td>\n",
" <td>2023-01-30</td>\n",
" <td>13</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>biweekly</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" start_date end_date open_items red_flags num_employees assessment_type\n",
"0 2023-01-01 2023-01-02 10 2 30 weekly\n",
"1 2023-01-08 2023-01-09 12 1 25 biweekly\n",
"2 2023-01-15 2023-01-16 11 3 28 quarterly\n",
"3 2023-01-22 2023-01-23 9 1 30 weekly\n",
"4 2023-01-29 2023-01-30 13 4 27 biweekly"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the dummy dataset\n",
"df_dummy.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>start_date</th>\n",
" <th>end_date</th>\n",
" <th>open_items</th>\n",
" <th>red_flags</th>\n",
" <th>num_employees</th>\n",
" <th>assessment_type_biweekly</th>\n",
" <th>assessment_type_quarterly</th>\n",
" <th>assessment_type_weekly</th>\n",
" <th>open_items_weekly_lag_1</th>\n",
" <th>open_items_biweekly_lag_1</th>\n",
" <th>open_items_quarterly_lag_1</th>\n",
" <th>open_items_weekly_ma_3</th>\n",
" <th>open_items_biweekly_ma_3</th>\n",
" <th>open_items_quarterly_ma_3</th>\n",
" <th>time_since_last_event</th>\n",
" <th>percentage_change_open_items</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-01-15</td>\n",
" <td>2023-01-16</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>12.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>-8.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-01-22</td>\n",
" <td>2023-01-23</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>30</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>11.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>10.666667</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>-18.181818</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-01-29</td>\n",
" <td>2023-01-30</td>\n",
" <td>13</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>11.0</td>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>44.444444</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" start_date end_date open_items red_flags num_employees \\\n",
"2 2023-01-15 2023-01-16 11 3 28 \n",
"3 2023-01-22 2023-01-23 9 1 30 \n",
"4 2023-01-29 2023-01-30 13 4 27 \n",
"\n",
" assessment_type_biweekly assessment_type_quarterly \\\n",
"2 False True \n",
"3 False False \n",
"4 True False \n",
"\n",
" assessment_type_weekly open_items_weekly_lag_1 open_items_biweekly_lag_1 \\\n",
"2 False 0.0 0.0 \n",
"3 True 11.0 0.0 \n",
"4 False 0.0 9.0 \n",
"\n",
" open_items_quarterly_lag_1 open_items_weekly_ma_3 \\\n",
"2 12.0 0.000000 \n",
"3 0.0 10.666667 \n",
"4 0.0 0.000000 \n",
"\n",
" open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n",
"2 0.0 11.0 7.0 \n",
"3 0.0 0.0 7.0 \n",
"4 11.0 0.0 7.0 \n",
"\n",
" percentage_change_open_items \n",
"2 -8.333333 \n",
"3 -18.181818 \n",
"4 44.444444 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Dummy dataset with the five most recent assessments\n",
"data_dummy = dict(\n",
"    start_date=pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
"    end_date=pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
"    open_items=[10, 12, 11, 9, 13],\n",
"    red_flags=[2, 1, 3, 1, 4],\n",
"    num_employees=[30, 25, 28, 30, 27],\n",
"    assessment_type=['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly'],\n",
")\n",
"\n",
"# One-hot encode 'assessment_type'; every category keeps its own indicator column\n",
"df_dummy = pd.get_dummies(pd.DataFrame(data_dummy), columns=['assessment_type'], drop_first=False)\n",
"\n",
"# Lag-1 of open_items over all rows, multiplied by the current row's type indicator\n",
"df_dummy = df_dummy.assign(**{\n",
"    f'open_items_{kind}_lag_1': df_dummy['open_items'].shift(1) * df_dummy[f'assessment_type_{kind}']\n",
"    for kind in ('weekly', 'biweekly', 'quarterly')\n",
"})\n",
"\n",
"# 3-row moving average of open_items, multiplied by the same indicators\n",
"df_dummy = df_dummy.assign(**{\n",
"    f'open_items_{kind}_ma_3': df_dummy['open_items'].rolling(window=3).mean() * df_dummy[f'assessment_type_{kind}']\n",
"    for kind in ('weekly', 'biweekly', 'quarterly')\n",
"})\n",
"\n",
"# Days since the previous assessment and percentage change of open items\n",
"df_dummy = df_dummy.assign(\n",
"    start_date=lambda frame: pd.to_datetime(frame['start_date']),\n",
"    time_since_last_event=lambda frame: frame['start_date'].diff().dt.days,\n",
"    percentage_change_open_items=lambda frame: frame['open_items'].pct_change() * 100,\n",
")\n",
"\n",
"# Drop the rows that hold NaN from the shift / rolling / diff calculations\n",
"df_dummy = df_dummy.dropna()\n",
"\n",
"# Display the final DataFrame with all time-based features\n",
"df_dummy.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Raw dummy dataset (five assessments, one week apart) to be written to disk\n",
"data_dummy = dict(\n",
"    start_date=pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
"    end_date=pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
"    open_items=[10, 12, 11, 9, 13],\n",
"    red_flags=[2, 1, 3, 1, 4],\n",
"    num_employees=[30, 25, 28, 30, 27],\n",
"    assessment_type=['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly'],\n",
")\n",
"df_dummy = pd.DataFrame(data_dummy)\n",
"\n",
"# Write the rows to a CSV in the current working directory, without the index column\n",
"df_dummy.to_csv(path_or_buf='dummy_assessment_data.csv', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>start_date</th>\n",
" <th>end_date</th>\n",
" <th>open_items</th>\n",
" <th>red_flags</th>\n",
" <th>num_employees</th>\n",
" <th>assessment_type_biweekly</th>\n",
" <th>assessment_type_quarterly</th>\n",
" <th>assessment_type_weekly</th>\n",
" <th>open_items_assessment_type_weekly_lag_1</th>\n",
" <th>open_items_assessment_type_biweekly_lag_1</th>\n",
" <th>open_items_assessment_type_quarterly_lag_1</th>\n",
" <th>open_items_weekly_ma_3</th>\n",
" <th>open_items_biweekly_ma_3</th>\n",
" <th>open_items_quarterly_ma_3</th>\n",
" <th>time_since_last_event</th>\n",
" <th>percentage_change_open_items</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [start_date, end_date, open_items, red_flags, num_employees, assessment_type_biweekly, assessment_type_quarterly, assessment_type_weekly, open_items_assessment_type_weekly_lag_1, open_items_assessment_type_biweekly_lag_1, open_items_assessment_type_quarterly_lag_1, open_items_weekly_ma_3, open_items_biweekly_ma_3, open_items_quarterly_ma_3, time_since_last_event, percentage_change_open_items]\n",
"Index: []"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Create a dummy dataset with past 5 assessments\n",
"data_dummy = {\n",
"    'start_date': pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
"    'end_date': pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
"    'open_items': [10, 12, 11, 9, 13],\n",
"    'red_flags': [2, 1, 3, 1, 4],\n",
"    'num_employees': [30, 25, 28, 30, 27],\n",
"    'assessment_type': ['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly']\n",
"}\n",
"\n",
"df_dummy = pd.DataFrame(data_dummy)\n",
"\n",
"# Convert 'assessment_type' to categorical (one-hot encoding)\n",
"df_dummy = pd.get_dummies(df_dummy, columns=['assessment_type'], drop_first=False)\n",
"\n",
"\n",
"def create_lagged_features(df, col, assessment_col):\n",
"    \"\"\"Add '<col>_<assessment_col>_lag_1': the previous value of `col` among rows of one type.\n",
"\n",
"    Args:\n",
"        df: Frame holding `col` and the indicator column `assessment_col`; modified in place.\n",
"        col: Name of the column to lag.\n",
"        assessment_col: Indicator column that equals 1 (or True) on rows of the wanted type.\n",
"\n",
"    Returns:\n",
"        The same frame. The new column is NaN on rows of other types and on the\n",
"        first row of this type.\n",
"    \"\"\"\n",
"    lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
"    # Shift within the rows of this type only; assignment re-aligns on the index\n",
"    df[lagged_col] = df.loc[df[assessment_col] == 1, col].shift(1)\n",
"    return df\n",
"\n",
"\n",
"# Create lagged features for each assessment type\n",
"df_dummy = create_lagged_features(df_dummy, 'open_items', 'assessment_type_weekly')\n",
"df_dummy = create_lagged_features(df_dummy, 'open_items', 'assessment_type_biweekly')\n",
"df_dummy = create_lagged_features(df_dummy, 'open_items', 'assessment_type_quarterly')\n",
"\n",
"# Create moving averages for each assessment type.\n",
"# min_periods=1 lets a type with fewer than three assessments still get an average;\n",
"# with the default (min_periods equal to the window) every value here would be NaN.\n",
"for assessment_type in ('weekly', 'biweekly', 'quarterly'):\n",
"    of_type = df_dummy[f'assessment_type_{assessment_type}'] == 1\n",
"    df_dummy[f'open_items_{assessment_type}_ma_3'] = (\n",
"        df_dummy.loc[of_type, 'open_items'].rolling(window=3, min_periods=1).mean()\n",
"    )\n",
"\n",
"# Add time since last event (days between assessments)\n",
"df_dummy['start_date'] = pd.to_datetime(df_dummy['start_date'])\n",
"df_dummy['time_since_last_event'] = df_dummy['start_date'].diff().dt.days\n",
"\n",
"# Add percentage change in open items\n",
"df_dummy['percentage_change_open_items'] = df_dummy['open_items'].pct_change() * 100\n",
"\n",
"# Every per-type column is NaN on the rows of the other types, so dropping NaN rows\n",
"# would always leave an empty frame. Fill the derived columns with 0 instead, once\n",
"# all of them exist (they are the only float columns of this frame).\n",
"feature_cols = df_dummy.select_dtypes(include='float').columns\n",
"df_dummy[feature_cols] = df_dummy[feature_cols].fillna(0)\n",
"\n",
"# Display the final DataFrame with all time-based features\n",
"df_dummy.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>start_date</th>\n",
" <th>end_date</th>\n",
" <th>open_items</th>\n",
" <th>red_flags</th>\n",
" <th>num_employees</th>\n",
" <th>assessment_type_biweekly</th>\n",
" <th>assessment_type_quarterly</th>\n",
" <th>assessment_type_weekly</th>\n",
" <th>open_items_assessment_type_weekly_lag_1</th>\n",
" <th>open_items_assessment_type_biweekly_lag_1</th>\n",
" <th>open_items_assessment_type_quarterly_lag_1</th>\n",
" <th>open_items_weekly_ma_3</th>\n",
" <th>open_items_biweekly_ma_3</th>\n",
" <th>open_items_quarterly_ma_3</th>\n",
" <th>time_since_last_event</th>\n",
" <th>percentage_change_open_items</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-01-01</td>\n",
" <td>2023-01-02</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" <td>30</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-01-08</td>\n",
" <td>2023-01-09</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>25</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>10.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>12.0</td>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>20.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-01-15</td>\n",
" <td>2023-01-16</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>12.0</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>12.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>-8.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-01-22</td>\n",
" <td>2023-01-23</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>30</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>11.0</td>\n",
" <td>9.0</td>\n",
" <td>12.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>-18.181818</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-01-29</td>\n",
" <td>2023-01-30</td>\n",
" <td>13</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>9.0</td>\n",
" <td>13.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>44.444444</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" start_date end_date open_items red_flags num_employees \\\n",
"0 2023-01-01 2023-01-02 10 2 30 \n",
"1 2023-01-08 2023-01-09 12 1 25 \n",
"2 2023-01-15 2023-01-16 11 3 28 \n",
"3 2023-01-22 2023-01-23 9 1 30 \n",
"4 2023-01-29 2023-01-30 13 4 27 \n",
"\n",
" assessment_type_biweekly assessment_type_quarterly \\\n",
"0 0 0 \n",
"1 1 0 \n",
"2 0 1 \n",
"3 0 0 \n",
"4 1 0 \n",
"\n",
" assessment_type_weekly open_items_assessment_type_weekly_lag_1 \\\n",
"0 1 0.0 \n",
"1 0 10.0 \n",
"2 0 0.0 \n",
"3 1 0.0 \n",
"4 0 9.0 \n",
"\n",
" open_items_assessment_type_biweekly_lag_1 \\\n",
"0 0.0 \n",
"1 0.0 \n",
"2 12.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"\n",
" open_items_assessment_type_quarterly_lag_1 open_items_weekly_ma_3 \\\n",
"0 0.0 10.0 \n",
"1 0.0 10.0 \n",
"2 0.0 10.0 \n",
"3 11.0 9.0 \n",
"4 0.0 9.0 \n",
"\n",
" open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n",
"0 0.0 0.0 0.0 \n",
"1 12.0 0.0 7.0 \n",
"2 12.0 11.0 7.0 \n",
"3 12.0 11.0 7.0 \n",
"4 13.0 11.0 7.0 \n",
"\n",
" percentage_change_open_items \n",
"0 0.000000 \n",
"1 20.000000 \n",
"2 -8.333333 \n",
"3 -18.181818 \n",
"4 44.444444 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"class DataPreprocessor:\n",
"    \"\"\"Load raw assessment data from a CSV, derive time-based features, and save the result.\n",
"\n",
"    Args:\n",
"        input_path: Path of the raw CSV. It must contain the columns\n",
"            'assessment_type', 'open_items' and 'start_date'.\n",
"        output_dir: Directory that receives 'preprocessed_data.csv'; it is created\n",
"            on save when it does not exist yet.\n",
"    \"\"\"\n",
"\n",
"    # Assessment types that get their own indicator, lag and moving-average columns\n",
"    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')\n",
"\n",
"    def __init__(self, input_path, output_dir):\n",
"        self.input_path = input_path\n",
"        self.output_dir = output_dir\n",
"        self.df = None  # set by load_data(), replaced by preprocess()\n",
"\n",
"    def load_data(self):\n",
"        \"\"\"Read the CSV at `input_path` into `self.df`.\"\"\"\n",
"        self.df = pd.read_csv(self.input_path)\n",
"\n",
"    def preprocess(self):\n",
"        \"\"\"Replace `self.df` with the feature-engineered frame.\n",
"\n",
"        Steps, in order: one-hot encode 'assessment_type' as 0/1 integers, add one\n",
"        lag column per assessment type, fill every NaN in the frame with 0, add one\n",
"        moving-average column per type, then add 'time_since_last_event' (days\n",
"        between consecutive start dates) and 'percentage_change_open_items'.\n",
"        \"\"\"\n",
"        # Convert 'assessment_type' to categorical (one-hot encoding)\n",
"        self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)\n",
"\n",
"        # Cast the indicators to 0/1. A type that never occurs in the input gets an\n",
"        # all-zero indicator, so the later steps and the output schema do not\n",
"        # depend on which types happen to be present.\n",
"        for kind in self.ASSESSMENT_TYPES:\n",
"            indicator = f'assessment_type_{kind}'\n",
"            if indicator not in self.df.columns:\n",
"                self.df[indicator] = 0\n",
"            self.df[indicator] = self.df[indicator].astype(int)\n",
"\n",
"        def create_lagged_features(df, col, assessment_col):\n",
"            \"\"\"Add '<col>_<assessment_col>_lag_1' to df and return df.\n",
"\n",
"            The new column holds the previous row's `col` value when that previous\n",
"            row has `assessment_col` equal to 1, and NaN otherwise.\n",
"            \"\"\"\n",
"            lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
"            df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)\n",
"            return df\n",
"\n",
"        # Create lagged features for each assessment type\n",
"        for kind in self.ASSESSMENT_TYPES:\n",
"            self.df = create_lagged_features(self.df, 'open_items', f'assessment_type_{kind}')\n",
"\n",
"        # Fill NaNs with 0 instead of dropping rows\n",
"        self.df = self.df.fillna(0)\n",
"\n",
"        # Moving average per type: mean of the open_items values of that type found\n",
"        # in the current and the two preceding rows, 0 when there is none\n",
"        for kind in self.ASSESSMENT_TYPES:\n",
"            of_type = self.df[f'assessment_type_{kind}'] == 1\n",
"            self.df[f'open_items_{kind}_ma_3'] = (\n",
"                self.df['open_items'].where(of_type).rolling(window=3, min_periods=1).mean().fillna(0)\n",
"            )\n",
"\n",
"        # Add time since last event (days between assessments); 0 for the first row\n",
"        self.df['start_date'] = pd.to_datetime(self.df['start_date'])\n",
"        self.df['time_since_last_event'] = self.df['start_date'].diff().dt.days.fillna(0)\n",
"\n",
"        # Add percentage change in open items; 0 for the first row\n",
"        self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100\n",
"\n",
"    def save_data(self):\n",
"        \"\"\"Write `self.df` to '<output_dir>/preprocessed_data.csv' and return that path.\"\"\"\n",
"        os.makedirs(self.output_dir, exist_ok=True)  # to_csv does not create directories\n",
"        output_path = os.path.join(self.output_dir, 'preprocessed_data.csv')\n",
"        self.df.to_csv(output_path, index=False)\n",
"        return output_path\n",
"\n",
"    def run(self):\n",
"        \"\"\"Load, preprocess and save; return the path of the written CSV.\"\"\"\n",
"        self.load_data()\n",
"        self.preprocess()\n",
"        return self.save_data()\n",
"\n",
"\n",
"# Example usage (commented out: the paths are placeholders, so running these\n",
"# lines raises FileNotFoundError):\n",
"# preprocessor = DataPreprocessor('path/to/input.csv', 'path/to/output/directory')\n",
"# output_file = preprocessor.run()\n",
"# print(f\"Preprocessed data saved to: {output_file}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>start_date</th>\n",
" <th>end_date</th>\n",
" <th>open_items</th>\n",
" <th>red_flags</th>\n",
" <th>num_employees</th>\n",
" <th>assessment_type_biweekly</th>\n",
" <th>assessment_type_quarterly</th>\n",
" <th>assessment_type_weekly</th>\n",
" <th>open_items_assessment_type_weekly_lag_1</th>\n",
" <th>open_items_assessment_type_biweekly_lag_1</th>\n",
" <th>open_items_assessment_type_quarterly_lag_1</th>\n",
" <th>open_items_weekly_ma_3</th>\n",
" <th>open_items_biweekly_ma_3</th>\n",
" <th>open_items_quarterly_ma_3</th>\n",
" <th>time_since_last_event</th>\n",
" <th>percentage_change_open_items</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-01-15</td>\n",
" <td>2023-01-16</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>12.0</td>\n",
" <td>0.0</td>\n",
" <td>10.0</td>\n",
" <td>12.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>-8.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-01-22</td>\n",
" <td>2023-01-23</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>30</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>11.0</td>\n",
" <td>9.0</td>\n",
" <td>12.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>-18.181818</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-01-29</td>\n",
" <td>2023-01-30</td>\n",
" <td>13</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>9.0</td>\n",
" <td>13.0</td>\n",
" <td>11.0</td>\n",
" <td>7.0</td>\n",
" <td>44.444444</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" start_date end_date open_items red_flags num_employees \\\n",
"2 2023-01-15 2023-01-16 11 3 28 \n",
"3 2023-01-22 2023-01-23 9 1 30 \n",
"4 2023-01-29 2023-01-30 13 4 27 \n",
"\n",
" assessment_type_biweekly assessment_type_quarterly \\\n",
"2 False True \n",
"3 False False \n",
"4 True False \n",
"\n",
" assessment_type_weekly open_items_assessment_type_weekly_lag_1 \\\n",
"2 False 0.0 \n",
"3 True 0.0 \n",
"4 False 9.0 \n",
"\n",
" open_items_assessment_type_biweekly_lag_1 \\\n",
"2 12.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"\n",
" open_items_assessment_type_quarterly_lag_1 open_items_weekly_ma_3 \\\n",
"2 0.0 10.0 \n",
"3 11.0 9.0 \n",
"4 0.0 9.0 \n",
"\n",
" open_items_biweekly_ma_3 open_items_quarterly_ma_3 time_since_last_event \\\n",
"2 12.0 11.0 7.0 \n",
"3 12.0 11.0 7.0 \n",
"4 13.0 11.0 7.0 \n",
"\n",
" percentage_change_open_items \n",
"2 -8.333333 \n",
"3 -18.181818 \n",
"4 44.444444 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): `df` is first assigned in a later cell of this notebook, so this\n",
"# preview only works after that cell has run - confirm the intended cell order.\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>start_date</th>\n",
" <th>end_date</th>\n",
" <th>open_items</th>\n",
" <th>red_flags</th>\n",
" <th>num_employees</th>\n",
" <th>assessment_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2023-01-01</td>\n",
" <td>2023-01-02</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" <td>30</td>\n",
" <td>weekly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2023-01-08</td>\n",
" <td>2023-01-09</td>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>25</td>\n",
" <td>biweekly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2023-01-15</td>\n",
" <td>2023-01-16</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>28</td>\n",
" <td>quarterly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2023-01-22</td>\n",
" <td>2023-01-23</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>30</td>\n",
" <td>weekly</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2023-01-29</td>\n",
" <td>2023-01-30</td>\n",
" <td>13</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>biweekly</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" start_date end_date open_items red_flags num_employees assessment_type\n",
"0 2023-01-01 2023-01-02 10 2 30 weekly\n",
"1 2023-01-08 2023-01-09 12 1 25 biweekly\n",
"2 2023-01-15 2023-01-16 11 3 28 quarterly\n",
"3 2023-01-22 2023-01-23 9 1 30 weekly\n",
"4 2023-01-29 2023-01-30 13 4 27 biweekly"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first five rows of df_dummy\n",
"df_dummy.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Dummy dataset with the five most recent assessments\n",
"data_dummy = dict(\n",
"    start_date=pd.date_range(start='2023-01-01', periods=5, freq='7D'),\n",
"    end_date=pd.date_range(start='2023-01-02', periods=5, freq='7D'),\n",
"    open_items=[10, 12, 11, 9, 13],\n",
"    red_flags=[2, 1, 3, 1, 4],\n",
"    num_employees=[30, 25, 28, 30, 27],\n",
"    assessment_type=['weekly', 'biweekly', 'quarterly', 'weekly', 'biweekly'],\n",
")\n",
"\n",
"# One-hot encode 'assessment_type', then turn the boolean indicators into 0/1 integers\n",
"df = pd.get_dummies(pd.DataFrame(data_dummy), columns=['assessment_type'], drop_first=False)\n",
"df = df.astype({\n",
"    'assessment_type_weekly': int,\n",
"    'assessment_type_biweekly': int,\n",
"    'assessment_type_quarterly': int,\n",
"})\n",
"\n",
"\n",
"def create_lagged_features(df, col, assessment_col):\n",
"    \"\"\"Add '<col>_<assessment_col>_lag_1' to df and return df.\n",
"\n",
"    The new column holds the previous row's `col` value when that previous row\n",
"    has `assessment_col` equal to 1, and NaN otherwise.\n",
"    \"\"\"\n",
"    kept_for_type = df[col].where(df[assessment_col] == 1)\n",
"    df[f\"{col}_{assessment_col}_lag_1\"] = kept_for_type.shift(1)\n",
"    return df\n",
"\n",
"\n",
"# One lag column per assessment type\n",
"df = create_lagged_features(df, 'open_items', 'assessment_type_weekly')\n",
"df = create_lagged_features(df, 'open_items', 'assessment_type_biweekly')\n",
"df = create_lagged_features(df, 'open_items', 'assessment_type_quarterly')\n",
"\n",
"# Keep every row: missing lags become 0 instead of being dropped\n",
"df.fillna(0, inplace=True)\n",
"\n",
"# Moving average per type: mean of the open_items values of that type found in the\n",
"# current and the two preceding rows, 0 when there is none\n",
"df = df.assign(**{\n",
"    f'open_items_{kind}_ma_3': (\n",
"        df['open_items']\n",
"        .where(df[f'assessment_type_{kind}'] == 1)\n",
"        .rolling(window=3, min_periods=1)\n",
"        .mean()\n",
"        .fillna(0)\n",
"    )\n",
"    for kind in ('weekly', 'biweekly', 'quarterly')\n",
"})\n",
"\n",
"# Days since the previous assessment and percentage change of open items (0 on the first row)\n",
"df = df.assign(\n",
"    start_date=lambda frame: pd.to_datetime(frame['start_date']),\n",
"    time_since_last_event=lambda frame: frame['start_date'].diff().dt.days.fillna(0),\n",
"    percentage_change_open_items=lambda frame: frame['open_items'].pct_change().fillna(0) * 100,\n",
")\n",
"\n",
"# Display the final DataFrame with all time-based features\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"class DataPreprocessor:\n",
"    \"\"\"Turn a company's raw assessment CSV into a feature table and save it.\n",
"\n",
"    Args:\n",
"        input_path: Path of the raw CSV. It must contain the columns 'start_date',\n",
"            'end_date', 'assessment_type' and 'open_items'.\n",
"        company_id: Identifier used as the last component of the output directory\n",
"            'data/processed/assessment_prediction/<company_id>'.\n",
"    \"\"\"\n",
"\n",
"    # Assessment types that get their own indicator, lag and moving-average columns\n",
"    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')\n",
"\n",
"    def __init__(self, input_path, company_id):\n",
"        self.input_path = input_path\n",
"        self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id)\n",
"        self.company_id = company_id\n",
"        self.df = None  # set by load_data(), replaced by preprocess()\n",
"\n",
"    def load_data(self):\n",
"        \"\"\"Read the CSV at `input_path` into `self.df`.\"\"\"\n",
"        self.df = pd.read_csv(self.input_path)\n",
"\n",
"    def preprocess(self):\n",
"        \"\"\"Replace `self.df` with the feature-engineered frame.\n",
"\n",
"        Steps, in order: add 'duration' (end_date minus start_date, in days) and\n",
"        drop both date columns, one-hot encode 'assessment_type' as 0/1 integers,\n",
"        add one lag column per assessment type, fill every NaN in the frame with 0,\n",
"        add one moving-average column per type, then add\n",
"        'percentage_change_open_items'.\n",
"        \"\"\"\n",
"        # Convert 'start_date' and 'end_date' to datetime\n",
"        self.df['start_date'] = pd.to_datetime(self.df['start_date'])\n",
"        self.df['end_date'] = pd.to_datetime(self.df['end_date'])\n",
"\n",
"        # Add duration (in days) by subtracting start_date from end_date\n",
"        self.df['duration'] = (self.df['end_date'] - self.df['start_date']).dt.days\n",
"\n",
"        # Drop the 'start_date' and 'end_date' columns as they are not needed for training\n",
"        self.df = self.df.drop(columns=['start_date', 'end_date'])\n",
"\n",
"        # Convert 'assessment_type' to categorical (one-hot encoding)\n",
"        self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)\n",
"\n",
"        # Cast the indicators to 0/1. A type that never occurs in the input gets an\n",
"        # all-zero indicator, so the later steps and the output schema do not\n",
"        # depend on which types happen to be present.\n",
"        for kind in self.ASSESSMENT_TYPES:\n",
"            indicator = f'assessment_type_{kind}'\n",
"            if indicator not in self.df.columns:\n",
"                self.df[indicator] = 0\n",
"            self.df[indicator] = self.df[indicator].astype(int)\n",
"\n",
"        def create_lagged_features(df, col, assessment_col):\n",
"            \"\"\"Add '<col>_<assessment_col>_lag_1' to df and return df.\n",
"\n",
"            The new column holds the previous row's `col` value when that previous\n",
"            row has `assessment_col` equal to 1, and NaN otherwise.\n",
"            \"\"\"\n",
"            lagged_col = f\"{col}_{assessment_col}_lag_1\"\n",
"            df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)\n",
"            return df\n",
"\n",
"        # Create lagged features for each assessment type\n",
"        for kind in self.ASSESSMENT_TYPES:\n",
"            self.df = create_lagged_features(self.df, 'open_items', f'assessment_type_{kind}')\n",
"\n",
"        # Fill NaNs with 0 instead of dropping rows\n",
"        self.df = self.df.fillna(0)\n",
"\n",
"        # Moving average per type: mean of the open_items values of that type found\n",
"        # in the current and the two preceding rows, 0 when there is none\n",
"        for kind in self.ASSESSMENT_TYPES:\n",
"            of_type = self.df[f'assessment_type_{kind}'] == 1\n",
"            self.df[f'open_items_{kind}_ma_3'] = (\n",
"                self.df['open_items'].where(of_type).rolling(window=3, min_periods=1).mean().fillna(0)\n",
"            )\n",
"\n",
"        # Add percentage change in open items; 0 for the first row\n",
"        self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100\n",
"\n",
"    def save_data(self):\n",
"        \"\"\"Write `self.df` to '<output_dir>/output.csv' and return that path.\"\"\"\n",
"        os.makedirs(self.output_dir, exist_ok=True)  # Ensure output directory exists\n",
"        output_path = os.path.join(self.output_dir, 'output.csv')\n",
"        self.df.to_csv(output_path, index=False)\n",
"        return output_path\n",
"\n",
"    def run(self):\n",
"        \"\"\"Load, preprocess and save; return the path of the written CSV.\"\"\"\n",
"        self.load_data()\n",
"        self.preprocess()\n",
"        return self.save_data()\n",
"\n",
"# Example usage:\n",
"# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')\n",
"# processed_data_path = preprocessor.run()\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Run the preprocessing pipeline on the dummy dataset for a test company.\n",
"# NOTE(review): the input path is absolute and machine-specific - confirm whether a\n",
"# path relative to the project root is wanted here.\n",
"dp = DataPreprocessor(\n",
"    company_id=\"testid\",\n",
"    input_path=\"/root/ds_erp_ai/data/raw/dummy_assessment_data.csv\",\n",
")\n",
"\n",
"# Keep whatever run() returns so the next cell can display it\n",
"res = dp.run()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'data/processed/assessment_prediction/testid/output.csv'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model saved to models/assessment_prediction/testid/testid_model.pkl\n",
"Latest assessment data saved to models/assessment_prediction/testid/testid_latest_data.csv\n",
"Model Evaluation Metrics:\n",
"Mean Absolute Error (MAE): 1.3099999999999996\n",
"Mean Squared Error (MSE): 2.3089999999999997\n",
"R-squared (R²): nan\n",
"The model was saved at: models/assessment_prediction/testid/testid_model.pkl\n",
"The latest data was saved at: models/assessment_prediction/testid/testid_latest_data.csv\n",
"Evaluation Results: {'mae': 1.3099999999999996, 'mse': 2.3089999999999997, 'r2': nan}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/root/ds_erp_ai/erp/lib/python3.10/site-packages/sklearn/metrics/_regression.py:1211: UndefinedMetricWarning: R^2 score is not well-defined with less than two samples.\n",
" warnings.warn(msg, UndefinedMetricWarning)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.multioutput import MultiOutputRegressor\n",
"import joblib\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"class ModelTrainer:\n",
"    \"\"\"Train, persist, and evaluate a multi-target regressor for one company.\n",
"\n",
"    Every column of the preprocessed CSV except 'open_items' and 'red_flags'\n",
"    is used as a feature; those two columns are the prediction targets.\n",
"    Artifacts are written under models/assessment_prediction/COMPANY_ID/.\n",
"\n",
"    Args:\n",
"        preprocessed_data_path: CSV file read by load_data().\n",
"        company_id: Used for the output directory and the artifact file names.\n",
"        model: Estimator exposing fit(X, y) and predict(X).\n",
"        test_size: Held-out share passed to train_test_split (default 0.1).\n",
"        random_state: Seed passed to train_test_split (default 42).\n",
"    \"\"\"\n",
"\n",
"    # Columns predicted by the model; everything else is treated as a feature\n",
"    TARGET_COLUMNS = ['open_items', 'red_flags']\n",
"\n",
"    def __init__(self, preprocessed_data_path, company_id, model, test_size=0.1, random_state=42):\n",
"        self.preprocessed_data_path = preprocessed_data_path\n",
"        self.output_dir = os.path.join('models', 'assessment_prediction', company_id)\n",
"        self.company_id = company_id\n",
"        self.df = None\n",
"        self.model = model  # Model passed as an argument\n",
"        self.test_size = test_size\n",
"        self.random_state = random_state\n",
"        self.X_test = None\n",
"        self.y_test = None\n",
"\n",
"    def load_data(self):\n",
"        \"\"\"Read the preprocessed CSV into self.df.\"\"\"\n",
"        self.df = pd.read_csv(self.preprocessed_data_path)\n",
"\n",
"    def train_model(self):\n",
"        \"\"\"Fit the model, then save it together with the latest data row.\n",
"\n",
"        Returns:\n",
"            Tuple (model_path, latest_data_path) of the files written.\n",
"\n",
"        Raises:\n",
"            RuntimeError: If load_data() has not been called yet.\n",
"        \"\"\"\n",
"        if self.df is None:\n",
"            raise RuntimeError('No data loaded; call load_data() before train_model().')\n",
"\n",
"        # Split data into features (X) and target variables (y)\n",
"        X = self.df.drop(columns=self.TARGET_COLUMNS)\n",
"        y = self.df[self.TARGET_COLUMNS]  # Multi-target for open items and red flags\n",
"\n",
"        # Hold out part of the rows for evaluate_model()\n",
"        X_train, self.X_test, y_train, self.y_test = train_test_split(\n",
"            X, y, test_size=self.test_size, random_state=self.random_state\n",
"        )\n",
"\n",
"        # Train the model\n",
"        self.model.fit(X_train, y_train)\n",
"\n",
"        # Save the trained model\n",
"        os.makedirs(self.output_dir, exist_ok=True)\n",
"        model_path = os.path.join(self.output_dir, f'{self.company_id}_model.pkl')\n",
"        joblib.dump(self.model, model_path)\n",
"        print(f\"Model saved to {model_path}\")\n",
"\n",
"        # Save the latest row (last assessment data) for inference\n",
"        latest_data_path = os.path.join(self.output_dir, f'{self.company_id}_latest_data.csv')\n",
"        self.df.tail(1).to_csv(latest_data_path, index=False)\n",
"        print(f\"Latest assessment data saved to {latest_data_path}\")\n",
"\n",
"        # Return the model path and latest data path\n",
"        return model_path, latest_data_path\n",
"\n",
"    def evaluate_model(self):\n",
"        \"\"\"Score the model on the held-out rows and print the metrics.\n",
"\n",
"        Returns:\n",
"            Dict with keys 'mae', 'mse' and 'r2'. 'r2' is NaN when fewer than\n",
"            two rows were held out, because R-squared is undefined there.\n",
"\n",
"        Raises:\n",
"            RuntimeError: If train_model() has not produced a held-out split yet.\n",
"        \"\"\"\n",
"        if self.X_test is None or self.y_test is None:\n",
"            raise RuntimeError('No held-out data; call train_model() before evaluate_model().')\n",
"\n",
"        # Predict using the test data\n",
"        y_pred = self.model.predict(self.X_test)\n",
"\n",
"        # Calculate evaluation metrics\n",
"        mae = mean_absolute_error(self.y_test, y_pred)\n",
"        mse = mean_squared_error(self.y_test, y_pred)\n",
"        if len(self.y_test) < 2:\n",
"            # r2_score only warns and returns NaN for a single sample, so skip the call\n",
"            r2 = float('nan')\n",
"        else:\n",
"            r2 = r2_score(self.y_test, y_pred)\n",
"\n",
"        print(\"Model Evaluation Metrics:\")\n",
"        print(f\"Mean Absolute Error (MAE): {mae}\")\n",
"        print(f\"Mean Squared Error (MSE): {mse}\")\n",
"        print(f\"R-squared (R²): {r2}\")\n",
"\n",
"        # Return evaluation results\n",
"        return {'mae': mae, 'mse': mse, 'r2': r2}\n",
"\n",
"    def run(self):\n",
"        \"\"\"Load the data, train and save the model, then evaluate it.\n",
"\n",
"        Returns:\n",
"            Tuple (model_path, latest_data_path, evaluation_results).\n",
"        \"\"\"\n",
"        # Load data and train the model\n",
"        self.load_data()\n",
"        model_path, latest_data_path = self.train_model()\n",
"\n",
"        # Evaluate the model immediately after training\n",
"        evaluation_results = self.evaluate_model()\n",
"\n",
"        return model_path, latest_data_path, evaluation_results\n",
"\n",
"# Example usage: wrap the random forest so both targets are predicted together\n",
"base_forest = RandomForestRegressor(n_estimators=100, random_state=42)\n",
"model = MultiOutputRegressor(base_forest)\n",
"trainer = ModelTrainer(preprocessed_data_path=res, company_id='testid', model=model)\n",
"model_path, latest_data_path, evaluation_results = trainer.run()\n",
"\n",
"# Echo the artifact locations and the metrics returned by run()\n",
"for summary_line in (\n",
"    f\"The model was saved at: {model_path}\",\n",
"    f\"The latest data was saved at: {latest_data_path}\",\n",
"    f\"Evaluation Results: {evaluation_results}\",\n",
"):\n",
"    print(summary_line)\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'data/processed/assessment_prediction/testid/output.csv'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'models/assessment_prediction/testid/testid_model.pkl'"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show where the trained model was saved (value returned by trainer.run())\n",
"model_path"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model loaded from models/assessment_prediction/testid/testid_model.pkl\n",
"Latest data loaded from models/assessment_prediction/testid/testid_latest_data.csv\n",
"\n",
"Forecasting assessment 1/5\n",
"\n",
"Forecasting assessment 2/5\n",
"\n",
"Forecasting assessment 3/5\n",
"\n",
"Forecasting assessment 4/5\n",
"\n",
"Forecasting assessment 5/5\n",
"[{'forecast_step': 1, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 2, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 3, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 4, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}, {'forecast_step': 5, 'weekly': {'assessment_type': 'weekly', 'open_items': 11, 'red_flags': 3}, 'biweekly': {'assessment_type': 'biweekly', 'open_items': 12, 'red_flags': 3}, 'quarterly': {'assessment_type': 'quarterly', 'open_items': 12, 'red_flags': 3}}]\n"
]
}
],
"source": [
"import pandas as pd\n",
"import joblib\n",
"import os\n",
"\n",
"class AssessmentInference:\n",
"    \"\"\"Predict open items and red flags for upcoming assessments of a company.\n",
"\n",
"    The trained model and the latest assessment row are read from\n",
"    MODEL_DIR/assessment_prediction/COMPANY_ID/ (files COMPANY_ID_model.pkl and\n",
"    COMPANY_ID_latest_data.csv).\n",
"\n",
"    Args:\n",
"        company_id: Selects the artifact directory and file names.\n",
"        num_assessments: Number of forecast steps to produce.\n",
"        model_dir: Root directory of the saved artifacts (default 'models').\n",
"    \"\"\"\n",
"\n",
"    # Assessment types predicted at every forecast step, in output order\n",
"    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')\n",
"    # Target columns that must not be fed to the model as features\n",
"    TARGET_COLUMNS = ['open_items', 'red_flags']\n",
"\n",
"    def __init__(self, company_id, num_assessments, model_dir='models'):\n",
"        self.company_id = company_id\n",
"        self.num_assessments = num_assessments\n",
"        self.model_dir = model_dir\n",
"        self.model = None\n",
"        self.latest_data = None\n",
"\n",
"    def _artifact_path(self, suffix):\n",
"        \"\"\"Return the path of this company's artifact file ending in suffix.\"\"\"\n",
"        return os.path.join(\n",
"            self.model_dir, 'assessment_prediction', self.company_id, f'{self.company_id}_{suffix}'\n",
"        )\n",
"\n",
"    def load_model(self):\n",
"        \"\"\"Load the trained model into self.model.\"\"\"\n",
"        model_path = self._artifact_path('model.pkl')\n",
"        # joblib.load unpickles the file and can run arbitrary code: only load trusted artifacts\n",
"        self.model = joblib.load(model_path)\n",
"        print(f\"Model loaded from {model_path}\")\n",
"\n",
"    def load_latest_data(self):\n",
"        \"\"\"Load the latest assessment data into self.latest_data.\"\"\"\n",
"        latest_data_path = self._artifact_path('latest_data.csv')\n",
"        self.latest_data = pd.read_csv(latest_data_path)\n",
"        print(f\"Latest data loaded from {latest_data_path}\")\n",
"\n",
"    def predict_next_assessment(self, current_data, assessment_type):\n",
"        \"\"\"Predict open items and red flags for one assessment type.\n",
"\n",
"        Args:\n",
"            current_data: DataFrame holding the current assessment row. It is\n",
"                not modified; the assessment type flags are set on a copy.\n",
"            assessment_type: 'weekly', 'biweekly' or 'quarterly'. Any other\n",
"                value leaves all three assessment type flags at 0.\n",
"\n",
"        Returns:\n",
"            Dict with 'assessment_type' and the rounded integer predictions\n",
"            'open_items' and 'red_flags' taken from the first predicted row.\n",
"\n",
"        Raises:\n",
"            RuntimeError: If the model has not been loaded.\n",
"        \"\"\"\n",
"        if self.model is None:\n",
"            raise RuntimeError('Model is not loaded; call load_model() first.')\n",
"\n",
"        # One-hot encode the requested type on a copy so the caller's frame stays untouched\n",
"        type_flags = {\n",
"            f'assessment_type_{name}': 1 if name == assessment_type else 0\n",
"            for name in self.ASSESSMENT_TYPES\n",
"        }\n",
"        # Exclude target variables (open_items, red_flags) from the feature set\n",
"        features = current_data.assign(**type_flags).drop(columns=self.TARGET_COLUMNS)\n",
"\n",
"        # Predict the next open items and red flags\n",
"        prediction = self.model.predict(features)\n",
"        open_items_pred, red_flags_pred = prediction[0]\n",
"\n",
"        # Ensure the predictions are integers by rounding\n",
"        return {\n",
"            'assessment_type': assessment_type,\n",
"            'open_items': int(round(open_items_pred)),\n",
"            'red_flags': int(round(red_flags_pred))\n",
"        }\n",
"\n",
"    def predict_next_assessments(self):\n",
"        \"\"\"Forecast num_assessments steps for every assessment type.\n",
"\n",
"        Returns:\n",
"            List with one dict per step containing 'forecast_step' and one\n",
"            prediction dict per assessment type.\n",
"\n",
"        Raises:\n",
"            RuntimeError: If the latest assessment data has not been loaded.\n",
"        \"\"\"\n",
"        if self.latest_data is None:\n",
"            raise RuntimeError('Latest data is not loaded; call load_latest_data() first.')\n",
"\n",
"        predictions = []\n",
"        current_data = self.latest_data.copy()\n",
"\n",
"        # Iteratively forecast the next assessments\n",
"        for i in range(self.num_assessments):\n",
"            print(f\"\\nForecasting assessment {i + 1}/{self.num_assessments}\")\n",
"\n",
"            # Predict for weekly, biweekly, and quarterly for the same forecast step\n",
"            step = {'forecast_step': i + 1}\n",
"            for name in self.ASSESSMENT_TYPES:\n",
"                step[name] = self.predict_next_assessment(current_data, name)\n",
"            predictions.append(step)\n",
"\n",
"            # NOTE(review): only the target columns are updated here, and they are dropped\n",
"            # again before predicting, so the features never change and every step returns\n",
"            # the same values. The derived feature columns would have to be rolled forward\n",
"            # to get a real multi-step forecast - TODO confirm the intended behaviour.\n",
"            current_data['open_items'] = step['weekly']['open_items']\n",
"            current_data['red_flags'] = step['weekly']['red_flags']\n",
"\n",
"        return predictions\n",
"\n",
"    def run(self):\n",
"        \"\"\"Load the model and latest data, then return the forecasts.\"\"\"\n",
"        self.load_model()\n",
"        self.load_latest_data()\n",
"        predictions = self.predict_next_assessments()\n",
"        return predictions\n",
"\n",
"\n",
"# Example usage: forecast the next five assessments for the test company\n",
"inference_settings = {'company_id': 'testid', 'num_assessments': 5}\n",
"inference = AssessmentInference(**inference_settings)\n",
"predictions = inference.run()\n",
"print(predictions)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "erp",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}