src/pipeline/data_preprocessor.py

import pandas as pd
import os

class DataPreprocessor:
    """Turn a raw assessment CSV into a feature table for model training.

    The pipeline reads ``input_path``, derives a ``duration`` column from the
    start/end dates, one-hot encodes ``assessment_type``, adds per-type lag
    and moving-average features of ``open_items`` plus its percentage change,
    and writes the result to
    ``data/processed/assessment_prediction/<company_id>/output.csv``.

    Attributes are plain instance fields: ``input_path``, ``output_dir``,
    ``company_id`` and ``df`` (``None`` until :meth:`load_data` runs).
    """

    # Assessment types that get a dedicated indicator, lag and moving-average
    # column. The order here fixes the order of the generated columns.
    ASSESSMENT_TYPES = ('weekly', 'biweekly', 'quarterly')

    def __init__(self, input_path, company_id):
        """Store the input location and derive the per-company output directory.

        Args:
            input_path: Path of the raw CSV passed to ``pandas.read_csv``.
            company_id: Identifier used as the last component of the output
                directory.
        """
        self.input_path = input_path
        self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id)
        self.company_id = company_id
        self.df = None

    def load_data(self):
        """Read the CSV at ``input_path`` into ``self.df``."""
        self.df = pd.read_csv(self.input_path)

    def preprocess(self):
        """Build the training features in place on ``self.df``.

        Expects the columns ``start_date``, ``end_date``, ``assessment_type``
        and ``open_items`` to be present.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        # NOTE(review): the lag, moving-average and percentage-change features
        # all depend on row order; rows are presumably already chronological,
        # but nothing here sorts or verifies that -- confirm with the caller.
        self._require_loaded()
        self._add_duration()
        self._encode_assessment_type()

        for kind in self.ASSESSMENT_TYPES:
            self._add_lag_feature('open_items', f'assessment_type_{kind}')

        # Fill NaNs with 0 instead of dropping rows. This applies to every
        # column of the frame, not only to the lag columns created above.
        self.df.fillna(0, inplace=True)

        for kind in self.ASSESSMENT_TYPES:
            self._add_moving_average('open_items', kind)

        self._add_percentage_change('open_items')

    def save_data(self):
        """Write ``self.df`` to ``output.csv`` in the output directory.

        Returns:
            The path of the written CSV file.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        self._require_loaded()
        os.makedirs(self.output_dir, exist_ok=True)  # Ensure output directory exists
        output_path = os.path.join(self.output_dir, 'output.csv')
        self.df.to_csv(output_path, index=False)
        return output_path

    def run(self):
        """Load, preprocess and save the data; return the output CSV path."""
        self.load_data()
        self.preprocess()
        return self.save_data()

    def _require_loaded(self):
        """Fail with a clear message when ``self.df`` has not been populated."""
        if self.df is None:
            raise RuntimeError("No data loaded: call load_data() before this step.")

    def _add_duration(self):
        """Replace ``start_date``/``end_date`` with a whole-day ``duration`` column."""
        start = pd.to_datetime(self.df['start_date'])
        end = pd.to_datetime(self.df['end_date'])
        self.df['duration'] = (end - start).dt.days
        # The raw date columns are not needed for training.
        self.df.drop(columns=['start_date', 'end_date'], inplace=True)

    def _encode_assessment_type(self):
        """One-hot encode ``assessment_type`` and guarantee the known indicators exist."""
        self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)
        for kind in self.ASSESSMENT_TYPES:
            indicator = f'assessment_type_{kind}'
            # get_dummies only creates columns for categories present in the
            # data; add an all-zero indicator so later steps never hit a
            # KeyError and the output schema is stable across inputs.
            if indicator not in self.df.columns:
                self.df[indicator] = 0
            # Convert boolean indicators to 1s and 0s.
            self.df[indicator] = self.df[indicator].astype(int)

    def _add_lag_feature(self, col, assessment_col):
        """Add ``<col>_<assessment_col>_lag_1`` holding the previous row's masked value."""
        masked = self.df[col].where(self.df[assessment_col] == 1)
        # shift(1) moves by one row, so the lag is populated only when the
        # immediately preceding row belongs to this assessment type.
        self.df[f"{col}_{assessment_col}_lag_1"] = masked.shift(1)

    def _add_moving_average(self, col, kind):
        """Add ``<col>_<kind>_ma_3``: mean of this type's values in the last 3 rows."""
        masked = self.df[col].where(self.df[f'assessment_type_{kind}'] == 1)
        # min_periods=1 averages whatever matching rows fall in the 3-row
        # window; windows without any matching row yield NaN, stored as 0.
        averaged = masked.rolling(window=3, min_periods=1).mean()
        self.df[f'{col}_{kind}_ma_3'] = averaged.fillna(0)

    def _add_percentage_change(self, col):
        """Add ``percentage_change_<col>`` as the row-over-row change in percent."""
        change = self.df[col].pct_change()
        # A previous value of 0 makes the change +/-inf (x/0) or NaN (0/0).
        # Both are undefined, so both are stored as 0 like the first row,
        # keeping non-finite values out of the training data.
        change = change.replace([float('inf'), float('-inf')], 0).fillna(0)
        self.df[f'percentage_change_{col}'] = change * 100

# Example usage:
# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')
# processed_data_path = preprocessor.run()
run tests on assessment prediction pipeline 2024-09-12 00:01:03 +00:00			`import pandas as pd`
			`import os`

			`class DataPreprocessor:`
			`def __init__(self, input_path, company_id):`
			`self.input_path = input_path`
			`self.output_dir = os.path.join('data', 'processed', 'assessment_prediction', company_id)`
			`self.company_id = company_id`
			`self.df = None`

			`def load_data(self):`
			`self.df = pd.read_csv(self.input_path)`

			`def preprocess(self):`
			`# Convert 'start_date' and 'end_date' to datetime`
			`self.df['start_date'] = pd.to_datetime(self.df['start_date'])`
			`self.df['end_date'] = pd.to_datetime(self.df['end_date'])`

			`# Add duration (in days) by subtracting start_date from end_date`
			`self.df['duration'] = (self.df['end_date'] - self.df['start_date']).dt.days`

			`# Drop the 'start_date' and 'end_date' columns as they are not needed for training`
			`self.df.drop(columns=['start_date', 'end_date'], inplace=True)`

			`# Convert 'assessment_type' to categorical (one-hot encoding)`
			`self.df = pd.get_dummies(self.df, columns=['assessment_type'], drop_first=False)`

			`# Convert boolean columns to 1s and 0s`
			`self.df['assessment_type_weekly'] = self.df['assessment_type_weekly'].astype(int)`
			`self.df['assessment_type_biweekly'] = self.df['assessment_type_biweekly'].astype(int)`
			`self.df['assessment_type_quarterly'] = self.df['assessment_type_quarterly'].astype(int)`

			`# Function to create lagged features based on assessment type`
			`def create_lagged_features(df, col, assessment_col):`
			`lagged_col = f"{col}_{assessment_col}_lag_1"`
			`df[lagged_col] = df[col].where(df[assessment_col] == 1).shift(1)`
			`return df`

			`# Create lagged features for each assessment type`
			`self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_weekly')`
			`self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_biweekly')`
			`self.df = create_lagged_features(self.df, 'open_items', 'assessment_type_quarterly')`

			`# Fill NaNs with 0 instead of dropping rows`
			`self.df.fillna(0, inplace=True)`

			`# Create moving averages for each assessment type`
			`self.df['open_items_weekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_weekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)`
			`self.df['open_items_biweekly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_biweekly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)`
			`self.df['open_items_quarterly_ma_3'] = self.df['open_items'].where(self.df['assessment_type_quarterly'] == 1).rolling(window=3, min_periods=1).mean().fillna(0)`

			`# Add percentage change in open items`
			`self.df['percentage_change_open_items'] = self.df['open_items'].pct_change().fillna(0) * 100`

			`def save_data(self):`
			`os.makedirs(self.output_dir, exist_ok=True) # Ensure output directory exists`
			`output_path = os.path.join(self.output_dir, 'output.csv')`
			`self.df.to_csv(output_path, index=False)`
			`return output_path`

			`def run(self):`
			`self.load_data()`
			`self.preprocess()`
			`return self.save_data()`

			`# Example usage:`
			`# preprocessor = DataPreprocessor(input_path='path_to_raw_data.csv', company_id='company_123')`
			`# processed_data_path = preprocessor.run()`