update functions

2025-07-14 23:41:31 +01:00
parent 0b5a7218b0
commit 97a3b710c3
7 changed files with 580 additions and 75 deletions
@@ -0,0 +1,95 @@
+import os
+from docx import Document
+from typing import Dict, List
+from .config import Config
+from .embeddings import EmbeddingGenerator
+
+
+class ComplianceLoader:
+    """Load tender compliance documents for similarity search and LLM prompts.
+
+    Each document found on disk is stored in ``compliance_docs`` under a
+    normalised key (file name, full text, heuristic sections); its embedding
+    is stored in ``compliance_embeddings`` under the same key.
+    """
+
+    # File names looked up inside the data folder by load_compliance_standards.
+    COMPLIANCE_FILES = (
+        "Invitation to Tender.docx",
+        "Tender Specifications.docx",
+        "Bill of Quantities.docx",
+        "Scope of Work.docx",
+        # NOTE(review): "SQualification" looks like a typo; kept verbatim
+        # because it has to match the file name on disk -- confirm.
+        "Supplier SQualification requirements.docx",
+        "form of tender.docx",
+        "confidentiality agreement.docx",
+        "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
+    )
+
+    # Substrings (matched case-insensitively) that make a line a section start.
+    SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')
+
+    def __init__(self, embedding_generator=None):
+        """Create an empty loader.
+
+        Args:
+            embedding_generator: Object exposing ``generate_embeddings(text)``.
+                When omitted a new ``EmbeddingGenerator`` is created, which is
+                the original behaviour; passing one in lets callers share or
+                stub the generator.
+        """
+        if embedding_generator is None:
+            embedding_generator = EmbeddingGenerator()
+        self.embedding_generator = embedding_generator
+        self.compliance_docs = {}
+        self.compliance_embeddings = {}
+
+    def load_compliance_standards(self, data_folder: str = "data/"):
+        """Load all compliance documents and generate embeddings.
+
+        Loading is best-effort: a missing or unreadable file is reported on
+        stdout and skipped so that the remaining documents still load.
+
+        Args:
+            data_folder: Directory that contains the ``.docx`` files.
+        """
+        for filename in self.COMPLIANCE_FILES:
+            file_path = os.path.join(data_folder, filename)
+            if not os.path.exists(file_path):
+                # Previously skipped silently, which hid misspelt file names.
+                print(f"Compliance standard not found, skipping: {file_path}")
+                continue
+
+            try:
+                # NOTE(review): only body paragraphs are read here; text held
+                # in tables (likely for a bill of quantities) is presumably
+                # not included -- confirm against python-docx.
+                doc = Document(file_path)
+                text = '\n'.join(para.text for para in doc.paragraphs)
+
+                # splitext drops only the trailing extension, unlike
+                # str.replace which would remove ".docx" anywhere in the name.
+                stem = os.path.splitext(filename)[0]
+                doc_key = stem.replace(' ', '_').lower()
+                self.compliance_docs[doc_key] = {
+                    'filename': filename,
+                    'content': text,
+                    'sections': self._extract_sections(text),
+                }
+
+                # Embedding used by get_relevant_standards for similarity search.
+                self.compliance_embeddings[doc_key] = (
+                    self.embedding_generator.generate_embeddings(text)
+                )
+
+                print(f"Loaded compliance standard: {filename}")
+
+            except Exception as e:
+                # Deliberately broad: one bad document must not stop the rest.
+                print(f"Error loading {filename}: {str(e)}")
+
+    def _extract_sections(self, text: str) -> List[str]:
+        """Split *text* into sections using simple heading heuristics.
+
+        A non-empty line starts a new section when it is all upper case, ends
+        with a colon, or contains one of ``SECTION_KEYWORDS``. Blank lines are
+        dropped and every other line is appended to the current section.
+
+        Returns:
+            The sections in document order, each joined with newlines.
+        """
+        sections = []
+        current_section = []
+
+        for raw_line in text.split('\n'):
+            line = raw_line.strip()
+            if not line:
+                continue
+
+            lowered = line.lower()
+            starts_section = (
+                line.isupper()
+                or line.endswith(':')
+                or any(keyword in lowered for keyword in self.SECTION_KEYWORDS)
+            )
+            if starts_section:
+                if current_section:
+                    sections.append('\n'.join(current_section))
+                current_section = [line]
+            else:
+                current_section.append(line)
+
+        if current_section:
+            sections.append('\n'.join(current_section))
+
+        return sections
+
+    @staticmethod
+    def _cosine_similarity(vec_a, vec_b) -> float:
+        """Return the cosine similarity of two flat numeric vectors.
+
+        Returns 0.0 when the vectors cannot be compared: non-numeric or
+        non-iterable input, empty input, different lengths, or a zero norm.
+        """
+        try:
+            a = [float(x) for x in vec_a]
+            b = [float(x) for x in vec_b]
+        except (TypeError, ValueError):
+            return 0.0
+
+        if not a or len(a) != len(b):
+            return 0.0
+
+        norm_a = sum(x * x for x in a) ** 0.5
+        norm_b = sum(y * y for y in b) ** 0.5
+        if norm_a == 0.0 or norm_b == 0.0:
+            return 0.0
+
+        return sum(x * y for x, y in zip(a, b)) / (norm_a * norm_b)
+
+    def get_relevant_standards(
+        self, document_embedding: List[float], threshold: float = 0.7
+    ) -> List[Dict]:
+        """Find the compliance standards similar to a document.
+
+        Args:
+            document_embedding: Embedding of the document being checked.
+                Assumed to be a flat vector comparable with the stored
+                embeddings -- TODO confirm the shape returned by
+                ``generate_embeddings``.
+            threshold: Minimum cosine similarity for a standard to be kept.
+
+        Returns:
+            One dict per matching standard with the keys ``standard``,
+            ``filename``, ``content``, ``sections`` and ``similarity``,
+            ordered from most to least similar.
+        """
+        relevant_standards = []
+
+        for doc_key, compliance_embedding in self.compliance_embeddings.items():
+            similarity = self._cosine_similarity(
+                document_embedding, compliance_embedding
+            )
+            if similarity < threshold:
+                continue
+
+            doc_data = self.compliance_docs[doc_key]
+            relevant_standards.append({
+                'standard': doc_key,
+                'filename': doc_data['filename'],
+                'content': doc_data['content'],
+                'sections': doc_data['sections'],
+                'similarity': similarity,
+            })
+
+        # Stable sort: equally similar standards keep their load order.
+        relevant_standards.sort(key=lambda item: item['similarity'], reverse=True)
+        return relevant_standards
+
+    def get_compliance_context(self, max_chars: int = 1000) -> str:
+        """Get formatted compliance context for LLM prompts.
+
+        Args:
+            max_chars: Maximum number of characters quoted from each document.
+
+        Returns:
+            A header followed by one titled excerpt per loaded document. An
+            ellipsis is appended only to excerpts that were actually cut.
+        """
+        parts = ["COMPLIANCE STANDARDS:\n\n"]
+
+        for doc_data in self.compliance_docs.values():
+            content = doc_data['content']
+            excerpt = content[:max_chars]
+            if len(content) > max_chars:
+                excerpt += "..."
+            parts.append(f"=== {doc_data['filename']} ===\n{excerpt}\n\n")
+
+        return ''.join(parts)