import math
import os
from typing import Dict, List, Optional

from docx import Document

from .config import Config
from .embeddings import EmbeddingGenerator


class ComplianceLoader:
    """Load compliance documents, embed them, and expose them for lookups.

    Attributes:
        embedding_generator: ``EmbeddingGenerator`` used to embed each
            document's full text.
        compliance_docs: Maps a normalized document key to a dict with the
            keys ``'filename'``, ``'content'`` and ``'sections'``.
        compliance_embeddings: Maps the same document key to whatever
            ``EmbeddingGenerator.generate_embeddings`` returned for it.
    """

    # File names looked up inside the data folder by
    # ``load_compliance_standards``. Spelling is kept exactly as-is because
    # it has to match the names of the files on disk.
    COMPLIANCE_FILES = (
        "Invitation to Tender.docx",
        "Tender Specifications.docx",
        "Bill of Quantities.docx",
        "Scope of Work.docx",
        "Supplier SQualification requirements.docx",
        "form of tender.docx",
        "confidentiality agreement.docx",
        "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
    )

    # A line containing any of these (case-insensitively) starts a new section.
    _SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')

    def __init__(self):
        self.embedding_generator = EmbeddingGenerator()
        self.compliance_docs = {}
        self.compliance_embeddings = {}

    def load_compliance_standards(self, data_folder: str = "data/"):
        """Load all compliance documents and generate embeddings.

        Each file listed in ``COMPLIANCE_FILES`` that exists in
        ``data_folder`` is read paragraph by paragraph, split into sections,
        and embedded. Files that do not exist are skipped silently; a file
        that fails to load is reported on stdout and does not stop the
        remaining files from loading.

        Args:
            data_folder: Directory that contains the ``.docx`` files.
        """
        for filename in self.COMPLIANCE_FILES:
            file_path = os.path.join(data_folder, filename)
            if not os.path.exists(file_path):
                continue

            try:
                # Extract text from compliance document (paragraph text only).
                doc = Document(file_path)
                text = '\n'.join(para.text for para in doc.paragraphs)

                # Store text and sections under a normalized key.
                doc_key = filename.replace('.docx', '').replace(' ', '_').lower()
                self.compliance_docs[doc_key] = {
                    'filename': filename,
                    'content': text,
                    'sections': self._extract_sections(text),
                }

                # Generate embedding for similarity search.
                self.compliance_embeddings[doc_key] = (
                    self.embedding_generator.generate_embeddings(text)
                )
                print(f"Loaded compliance standard: {filename}")
            except Exception as e:
                # Best-effort loading: one unreadable document must not
                # prevent the others from being loaded.
                print(f"Error loading {filename}: {str(e)}")

    def _extract_sections(self, text: str) -> List[str]:
        """Extract key sections from compliance documents.

        A non-empty line starts a new section when it is all upper-case,
        ends with ``:``, or contains one of ``_SECTION_KEYWORDS``. Every
        other non-empty line is appended to the section in progress. Blank
        lines are dropped.

        Args:
            text: Full document text with ``\\n`` line separators.

        Returns:
            The sections in document order, each joined with ``\\n``.
        """
        sections = []
        current_section = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()
            starts_section = (
                line.isupper()
                or line.endswith(':')
                or any(keyword in lowered for keyword in self._SECTION_KEYWORDS)
            )
            if starts_section:
                if current_section:
                    sections.append('\n'.join(current_section))
                current_section = [line]
            else:
                current_section.append(line)

        if current_section:
            sections.append('\n'.join(current_section))

        return sections

    @staticmethod
    def _as_vector(embedding) -> Optional[List[float]]:
        """Coerce an embedding to a flat list of floats, or None if impossible."""
        try:
            values = list(embedding)
            # Unwrap a single-row batch such as [[...]]; the exact shape
            # produced by the embedding generator is not visible here.
            if len(values) == 1 and hasattr(values[0], '__len__'):
                values = list(values[0])
            return [float(value) for value in values]
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _cosine_similarity(
        first: Optional[List[float]], second: Optional[List[float]]
    ) -> Optional[float]:
        """Return the cosine similarity, or None when it cannot be computed."""
        if not first or not second or len(first) != len(second):
            return None
        norm_first = math.sqrt(sum(x * x for x in first))
        norm_second = math.sqrt(sum(y * y for y in second))
        if norm_first == 0.0 or norm_second == 0.0:
            return None
        dot_product = sum(x * y for x, y in zip(first, second))
        return dot_product / (norm_first * norm_second)

    def get_relevant_standards(
        self, document_embedding: List[float], threshold: float = 0.7
    ) -> List[Dict]:
        """Find relevant compliance standards for a document.

        Each stored embedding is compared with ``document_embedding`` using
        cosine similarity; standards scoring below ``threshold`` are left
        out. When a similarity cannot be computed (missing, empty,
        non-numeric or differently sized embeddings) the standard is kept,
        so no standard is dropped merely because its score is unknown.

        Args:
            document_embedding: Embedding of the document being checked.
            threshold: Minimum cosine similarity for a standard to count
                as relevant.

        Returns:
            One dict per relevant standard, in load order, with the keys
            ``'standard'``, ``'filename'``, ``'content'``, ``'sections'``
            and ``'similarity'`` (the score, or ``None`` when unknown).
        """
        query_vector = self._as_vector(document_embedding)
        relevant_standards = []

        for doc_key, compliance_embedding in self.compliance_embeddings.items():
            # Calculate similarity
            similarity = self._cosine_similarity(
                query_vector, self._as_vector(compliance_embedding)
            )
            if similarity is not None and similarity < threshold:
                continue

            doc_data = self.compliance_docs[doc_key]
            relevant_standards.append({
                'standard': doc_key,
                'filename': doc_data['filename'],
                'content': doc_data['content'],
                'sections': doc_data['sections'],
                'similarity': similarity,
            })

        return relevant_standards

    def get_compliance_context(self) -> str:
        """Get formatted compliance context for LLM prompts.

        Returns:
            A header followed by one entry per loaded document: its file
            name and the first 1000 characters of its content, followed
            by ``...``.
        """
        parts = ["COMPLIANCE STANDARDS:\n\n"]
        for doc_data in self.compliance_docs.values():
            parts.append(f"=== {doc_data['filename']} ===\n")
            parts.append(f"{doc_data['content'][:1000]}...\n\n")
        return ''.join(parts)