run testes on assessments predictions pipeline

2024-09-12 00:01:03 +00:00
parent 007c0d81dc
commit 24823432b3
14 changed files with 254 additions and 16 deletions
@@ -0,0 +1,68 @@
+import pandas as pd
+import os
+
+class DataPreprocessor:
+    def __init__(self, input_path, company_id):
+        self.input_path = input_path
+        self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id)
+        self.company_id = company_id
+        self.df = None
+
+    def load_data(self):
+        self.df = pd.read_csv(self.input_path)
+
+    def preprocess(self):
+        # Convert 'start_date' and 'end_date' to datetime
+        self.df['start_date'] = pd.to_datetime(self.df['start_date'])
+        self.df['end_date'] = pd.to_datetime(self.df['end_date'])
+
+        # Add duration (in days) by subtracting start_date from end_date
+        self.df['duration'] = (self.df['end_date'] - self.df['start_date']).dt.days
+
+        # Drop the 'start_date' and 'end_date' columns as they are not needed for training
+        self.df.drop(columns=['start_date', 'end_date'], inplace=True)
+
+        # Convert 'assessment_type' to categorical (one-hot encoding)
+        self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)
+
+        # Convert boolean columns to 1s and 0s
+        self.df['assessment_type_weekly'] = self.df['assessment_type_weekly'].astype(int)
+        self.df['assessment_type_biweekly'] = self.df['assessment_type_biweekly'].astype(int)
+        self.df['assessment_type_quarterly'] = self.df['assessment_type_quarterly'].astype(int)
+
+        # Function to create lagged features based on assessment type
+        def create_lagged_features(df, col, assessment_col):
+            lagged_col = f"{col}_{assessment_col}_lag_1"
+            df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)
+            return df
+
+        # Create lagged features for each assessment type
+        self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_weekly')
+        self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_biweekly')
+        self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_quarterly')
+
+        # Fill NaNs with 0 instead of dropping rows
+        self.df.fillna(0, inplace=True)
+
+        # Create moving averages for each assessment type
+        self.df['open_items_weekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)
+        self.df['open_items_biweekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)
+        self.df['open_items_quarterly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)
+
+        # Add percentage change in open items
+        self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100
+
+    def save_data(self):
+        os.makedirs(self.output_dir, exist_ok=True)  # Ensure output directory exists
+        output_path = os.path.join(self.output_dir, 'output.csv')
+        self.df.to_csv(output_path, index=False)
+        return output_path
+
+    def run(self):
+        self.load_data()
+        self.preprocess()
+        return self.save_data()
+
+# Example usage:
+# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')
+# processed_data_path = preprocessor.run()
@@ -0,0 +1,88 @@
+import pandas as pd
+import os
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.multioutput import MultiOutputRegressor
+import joblib
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+import logging
+from logging.handlers import RotatingFileHandler
+
+
+handler = RotatingFileHandler('/root/ds_erp_ai/logs/prediction_pipeline.log', maxBytes=100000, backupCount=3)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+logger.addHandler(handler)
+
+class ModelTrainer:
+    def __init__(self, preprocessed_data_path, company_id, model):
+        self.preprocessed_data_path = preprocessed_data_path
+        self.output_dir = os.path.join('models', 'assessment_prediction', company_id)
+        self.company_id = company_id
+        self.df = None
+        self.model = model  # Model passed as an argument
+        self.X_test = None
+        self.y_test = None
+
+    def load_data(self):
+        self.df = pd.read_csv(self.preprocessed_data_path)
+
+    def train_model(self):
+        # Split data into features (X) and target variables (y)
+        X = self.df.drop(columns=['open_items', 'red_flags'])
+        y = self.df[['open_items', 'red_flags']]  # Multi-target for open items and red flags
+
+        # Split into training and test sets with 10% as test size
+        X_train, self.X_test, y_train, self.y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+
+        # Train the model
+        self.model.fit(X_train, y_train)
+
+        # Save the trained model
+        os.makedirs(self.output_dir, exist_ok=True)
+        model_path = os.path.join(self.output_dir, f'{self.company_id}_model.pkl')
+        joblib.dump(self.model, model_path)
+        print(f"Model saved to {model_path}")
+
+        # Save the latest row (last assessment data) for inference
+        latest_data_path = os.path.join(self.output_dir, f'{self.company_id}_latest_data.csv')
+        self.df.tail(1).to_csv(latest_data_path, index=False)
+        print(f"Latest assessment data saved to {latest_data_path}")
+
+        # Return the model path and latest data path
+        return model_path, latest_data_path
+
+    def evaluate_model(self):
+        # Predict using the test data
+        y_pred = self.model.predict(self.X_test)
+
+        # Calculate evaluation metrics
+        mae = mean_absolute_error(self.y_test, y_pred)
+        mse = mean_squared_error(self.y_test, y_pred)
+        r2 = r2_score(self.y_test, y_pred)
+
+        print("Model Evaluation Metrics:")
+        print(f"Mean Absolute Error (MAE): {mae}")
+        print(f"Mean Squared Error (MSE): {mse}")
+        print(f"R-squared (R²): {r2}")
+
+        # Return evaluation results
+        return {'mae': mae, 'mse': mse, 'r2': r2}
+
+    def run(self):
+        # Load data and train the model
+        self.load_data()
+        model_path, latest_data_path = self.train_model()
+
+        # Evaluate the model immediately after training
+        evaluation_results = self.evaluate_model()
+
+        return model_path, latest_data_path, evaluation_results
+
+# Example usage
+'''model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
+trainer = ModelTrainer(preprocessed_data_path=res, company_id='testid', model=model)
+model_path, latest_data_path, evaluation_results = trainer.run()
+print(f"The model was saved at: {model_path}")
+print(f"The latest data was saved at: {latest_data_path}")
+print(f"Evaluation Results: {evaluation_results}")'''