import logging
import os
from logging.handlers import RotatingFileHandler

import pandas as pd


class DataPreprocessor:
    """Turn a raw assessment CSV into a feature table for model training.

    The pipeline is ``load_data`` -> ``preprocess`` -> ``save_data``;
    ``run`` executes all three and returns the path of the written file.

    The input CSV must provide the columns ``start_date``, ``end_date``,
    ``assessment_type`` and ``open_items``, which ``preprocess`` reads.
    """

    # Assessment types that get dedicated indicator, lag and moving-average
    # features. The order here fixes the order of the generated columns.
    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')

    def __init__(self, input_path, company_id):
        """Store the input location and derive the output directory.

        Args:
            input_path: Location of the raw CSV, passed to ``pd.read_csv``.
            company_id: Path component appended to
                ``data/processed/assessment_prediction`` to form the
                output directory.
        """
        self.input_path = input_path
        self.output_dir = os.path.join(
            'data', 'processed', 'assessment_prediction', company_id
        )
        self.company_id = company_id
        self.df = None

    def load_data(self) -> None:
        """Read the raw CSV at ``self.input_path`` into ``self.df``."""
        self.df = pd.read_csv(self.input_path)

    @staticmethod
    def _add_lagged_feature(df, col, assessment_col):
        """Add ``{col}_{assessment_col}_lag_1`` to ``df`` and return ``df``.

        The new column holds ``col`` masked to rows where ``assessment_col``
        equals 1, shifted down by one row.
        """
        # NOTE(review): the shift is positional over the whole frame, so the
        # value lands on the row directly after an assessment of this type,
        # not on the next assessment of the same type. Kept as-is; confirm
        # this is the intended definition of "lag".
        lagged_col = f"{col}_{assessment_col}_lag_1"
        df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)
        return df

    def preprocess(self) -> None:
        """Derive the training features in ``self.df``.

        Steps, in order:
          1. Replace ``start_date``/``end_date`` with ``duration`` in days.
          2. One-hot encode ``assessment_type`` and cast the indicator of
             every type in ``ASSESSMENT_TYPES`` to int.
          3. Add one-row lagged ``open_items`` features per assessment type.
          4. Fill every remaining NaN in the frame with 0.
          5. Add 3-row moving averages of ``open_items`` per assessment type.
          6. Add the row-over-row percentage change of ``open_items``.
        """
        # Duration in whole days; the raw date columns are not used for
        # training and are dropped afterwards.
        self.df['start_date'] = pd.to_datetime(self.df['start_date'])
        self.df['end_date'] = pd.to_datetime(self.df['end_date'])
        self.df['duration'] = (
            self.df['end_date'] - self.df['start_date']
        ).dt.days
        self.df.drop(columns=['start_date', 'end_date'], inplace=True)

        # One-hot encode the assessment type.
        self.df = pd.get_dummies(
            self.df, columns=['assessment_type'], drop_first=False
        )

        indicator_cols = [
            f'assessment_type_{kind}' for kind in self.ASSESSMENT_TYPES
        ]
        for indicator in indicator_cols:
            # get_dummies only emits columns for types present in the data;
            # add an all-zero indicator for absent types so the feature
            # schema is stable and the lookups below cannot raise KeyError.
            if indicator not in self.df.columns:
                self.df[indicator] = 0
            # Convert boolean indicators to 1s and 0s.
            self.df[indicator] = self.df[indicator].astype(int)

        # Lagged open_items features, one per assessment type.
        for indicator in indicator_cols:
            self.df = self._add_lagged_feature(
                self.df, 'open_items', indicator
            )

        # Fill NaNs with 0 instead of dropping rows.
        self.df.fillna(0, inplace=True)

        # Moving average over the last 3 rows, counting only rows of the
        # given assessment type; rows with no such value in the window get 0.
        for kind, indicator in zip(self.ASSESSMENT_TYPES, indicator_cols):
            masked = self.df['open_items'].where(self.df[indicator] == 1)
            self.df[f'open_items_{kind}_ma_3'] = (
                masked.rolling(window=3, min_periods=1).mean().fillna(0)
            )

        # Percentage change in open items. A change from 0 to a non-zero
        # value is +/-inf, which fillna would not catch; treat it like the
        # undefined (NaN) case and report 0.
        change = self.df['open_items'].pct_change()
        change = change.replace([float('inf'), float('-inf')], float('nan'))
        self.df['percentage_change_open_items'] = change.fillna(0) * 100

    def save_data(self) -> str:
        """Write ``self.df`` to ``<output_dir>/output.csv``.

        The output directory is created if it does not exist.

        Returns:
            The path of the written CSV file.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        output_path = os.path.join(self.output_dir, 'output.csv')
        self.df.to_csv(output_path, index=False)
        return output_path

    def run(self) -> str:
        """Load, preprocess and save the data; return the output path."""
        self.load_data()
        self.preprocess()
        return self.save_data()


# Example usage:
# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')
# processed_data_path = preprocessor.run()