# ds_task_scp/backend/compliance_loader.py

import os
from docx import Document
from typing import Dict, List
from .config import Config
from .embeddings import EmbeddingGenerator


class ComplianceLoader:
    """Load tender compliance documents, embed them, and expose them as context.

    ``load_compliance_standards`` reads a fixed set of ``.docx`` files, stores
    their text and heuristic sections in ``compliance_docs`` and one embedding
    per document in ``compliance_embeddings``. Both dicts share the same keys.
    """

    # File names looked up (by exact name) inside the data folder.
    COMPLIANCE_FILES = (
        "Invitation to Tender.docx",
        "Tender Specifications.docx",
        "Bill of Quantities.docx",
        "Scope of Work.docx",
        "Supplier SQualification requirements.docx",
        "form of tender.docx",
        "confidentiality agreement.docx",
        "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
    )

    # A line containing any of these substrings (case-insensitive) starts a new section.
    SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')

    def __init__(self):
        self.embedding_generator = EmbeddingGenerator()
        # doc_key -> {'filename': str, 'content': str, 'sections': List[str]}
        self.compliance_docs: Dict[str, Dict] = {}
        # doc_key -> embedding exactly as returned by the embedding generator
        self.compliance_embeddings: Dict = {}

    def load_compliance_standards(self, data_folder: str = "data/"):
        """Load all compliance documents and generate embeddings.

        Each file in ``COMPLIANCE_FILES`` that exists in ``data_folder`` is
        read, split into sections and embedded. Loading is best-effort: a
        missing or unreadable file is reported on stdout and skipped, and the
        remaining files are still processed.

        Args:
            data_folder: Directory that contains the ``.docx`` files.
        """
        for filename in self.COMPLIANCE_FILES:
            file_path = os.path.join(data_folder, filename)
            if not os.path.exists(file_path):
                print(f"Compliance standard not found, skipping: {file_path}")
                continue

            try:
                # NOTE(review): only ``doc.paragraphs`` is read; text held in
                # tables is presumably not included - confirm against python-docx.
                doc = Document(file_path)
                text = '\n'.join(para.text for para in doc.paragraphs)
                sections = self._extract_sections(text)
                embedding = self.embedding_generator.generate_embeddings(text)
            except Exception as e:
                # Deliberate best-effort: one bad document must not stop the rest.
                print(f"Error loading {filename}: {str(e)}")
                continue

            # Store text and embedding together, only after both succeeded, so
            # the two dicts never disagree about which documents are loaded.
            doc_key = os.path.splitext(filename)[0].replace(' ', '_').lower()
            self.compliance_docs[doc_key] = {
                'filename': filename,
                'content': text,
                'sections': sections,
            }
            self.compliance_embeddings[doc_key] = embedding

            print(f"Loaded compliance standard: {filename}")

    def _extract_sections(self, text: str) -> List[str]:
        """Extract key sections from compliance documents.

        A non-empty line starts a new section when it is all upper-case, ends
        with ``:``, or contains one of ``SECTION_KEYWORDS``. Other non-empty
        lines are appended to the current section; blank lines are dropped.

        Args:
            text: Full document text with ``\\n`` line separators.

        Returns:
            Sections in document order, each joined with ``\\n``.
        """
        sections = []
        current_section = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()
            starts_section = (
                line.isupper()
                or line.endswith(':')
                or any(keyword in lowered for keyword in self.SECTION_KEYWORDS)
            )
            if starts_section:
                if current_section:
                    sections.append('\n'.join(current_section))
                current_section = [line]
            else:
                current_section.append(line)

        if current_section:
            sections.append('\n'.join(current_section))

        return sections

    @staticmethod
    def _flatten(embedding) -> List[float]:
        """Return ``embedding`` as a flat list of floats.

        Accepts flat or nested lists/tuples; objects exposing ``tolist()``
        (array-like embeddings) are converted through it first.
        """
        if hasattr(embedding, 'tolist'):
            embedding = embedding.tolist()
        if embedding is None:
            return []
        if not isinstance(embedding, (list, tuple)):
            return [float(embedding)]

        flat = []
        for value in embedding:
            if isinstance(value, (list, tuple)):
                flat.extend(ComplianceLoader._flatten(value))
            else:
                flat.append(float(value))
        return flat

    @staticmethod
    def _cosine_similarity(vec_a, vec_b):
        """Return the cosine similarity of two embeddings, or ``None``.

        ``None`` means the similarity is undefined: an empty vector, vectors
        of different length, or a zero-length (all zeros) vector.
        """
        a = ComplianceLoader._flatten(vec_a)
        b = ComplianceLoader._flatten(vec_b)
        if not a or len(a) != len(b):
            return None

        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(y * y for y in b) ** 0.5
        if norm_a == 0 or norm_b == 0:
            return None

        return sum(x * y for x, y in zip(a, b)) / (norm_a * norm_b)

    def get_relevant_standards(self, document_embedding: List[float], threshold: float = 0.7) -> List[Dict]:
        """Find relevant compliance standards for a document.

        Args:
            document_embedding: Embedding of the document being checked.
            threshold: Minimum cosine similarity for a standard to be returned.

        Returns:
            One dict per matching standard, in load order, with the keys
            ``standard``, ``filename``, ``content``, ``sections`` and
            ``similarity``. A standard whose similarity cannot be computed
            (see ``_cosine_similarity``) is kept with ``similarity`` set to
            ``None``, because it cannot be ruled out.
        """
        relevant_standards = []

        for doc_key, compliance_embedding in self.compliance_embeddings.items():
            similarity = self._cosine_similarity(document_embedding, compliance_embedding)
            if similarity is not None and similarity < threshold:
                continue

            doc_data = self.compliance_docs[doc_key]
            relevant_standards.append({
                'standard': doc_key,
                'filename': doc_data['filename'],
                'content': doc_data['content'],
                'sections': doc_data['sections'],
                'similarity': similarity,
            })

        return relevant_standards

    def get_compliance_context(self, max_chars: int = 1000) -> str:
        """Get formatted compliance context for LLM prompts.

        Args:
            max_chars: Maximum number of characters of each document's
                content to include.

        Returns:
            A header followed by one ``=== filename ===`` block per loaded
            document. ``...`` is appended only when the content was actually
            truncated.
        """
        parts = ["COMPLIANCE STANDARDS:\n\n"]

        for doc_data in self.compliance_docs.values():
            content = doc_data['content']
            excerpt = content[:max_chars]
            if len(content) > len(excerpt):
                excerpt += "..."
            parts.append(f"=== {doc_data['filename']} ===\n")
            parts.append(f"{excerpt}\n\n")

        return ''.join(parts)