Files
ds_task_scp/backend/compliance_loader.py
T
2025-07-14 23:41:31 +01:00

96 lines
3.6 KiB
Python

import os
from docx import Document
from typing import Dict, List
from .config import Config
from .embeddings import EmbeddingGenerator
class ComplianceLoader:
    """Load tender compliance documents and expose them for lookup and prompts.

    ``load_compliance_standards`` fills two dictionaries keyed by a normalised
    document key (file name without ``.docx``, spaces replaced by underscores,
    lower-cased):

    * ``compliance_docs`` -> ``{'filename', 'content', 'sections'}``
    * ``compliance_embeddings`` -> the value returned by
      ``EmbeddingGenerator.generate_embeddings`` for the document text
    """

    # Substrings that mark a line as the start of a new section.
    _SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')

    def __init__(self):
        self.embedding_generator = EmbeddingGenerator()
        self.compliance_docs = {}
        self.compliance_embeddings = {}

    def load_compliance_standards(self, data_folder: str = "data/"):
        """Load all compliance documents and generate embeddings.

        Every known ``.docx`` file found in ``data_folder`` is read, split into
        sections and embedded. Missing files are reported and skipped. An error
        while processing one file is printed and does not stop the others.

        Args:
            data_folder: Directory that contains the compliance ``.docx`` files.
        """
        compliance_files = [
            "Invitation to Tender.docx",
            "Tender Specifications.docx",
            "Bill of Quantities.docx",
            "Scope of Work.docx",
            "Supplier SQualification requirements.docx",
            "form of tender.docx",
            "confidentiality agreement.docx",
            "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
        ]
        for filename in compliance_files:
            file_path = os.path.join(data_folder, filename)
            if not os.path.exists(file_path):
                # Previously skipped silently, which hid misnamed or missing files.
                print(f"Compliance standard not found, skipping: {file_path}")
                continue
            try:
                # Extract text from the compliance document.
                # NOTE(review): only ``doc.paragraphs`` is read, so text held in
                # tables is not captured -- confirm that is acceptable for
                # table-heavy files such as the bill of quantities.
                doc = Document(file_path)
                text = '\n'.join([para.text for para in doc.paragraphs])
                doc_key = filename.replace('.docx', '').replace(' ', '_').lower()
                # The text is stored before embedding, so the document stays
                # available in ``compliance_docs`` even if embedding fails below.
                self.compliance_docs[doc_key] = {
                    'filename': filename,
                    'content': text,
                    'sections': self._extract_sections(text),
                }
                # Generate embedding for similarity search.
                self.compliance_embeddings[doc_key] = (
                    self.embedding_generator.generate_embeddings(text)
                )
                print(f"Loaded compliance standard: {filename}")
            except Exception as e:
                # Best-effort loading: one bad file must not abort the rest.
                print(f"Error loading {filename}: {str(e)}")

    def _extract_sections(self, text: str) -> List[str]:
        """Split document text into sections.

        A non-empty line starts a new section when it is all upper-case, ends
        with ``:``, or contains one of ``_SECTION_KEYWORDS`` (case-insensitive
        substring match). Other non-empty lines are appended to the current
        section; blank lines are dropped.

        Args:
            text: Full document text with ``\\n`` line separators.

        Returns:
            The sections in document order, each joined with ``\\n``.
        """
        sections = []
        current_section = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            lowered = line.lower()
            starts_section = (
                line.isupper()
                or line.endswith(':')
                or any(keyword in lowered for keyword in self._SECTION_KEYWORDS)
            )
            if starts_section:
                if current_section:
                    sections.append('\n'.join(current_section))
                current_section = [line]
            else:
                current_section.append(line)
        if current_section:
            sections.append('\n'.join(current_section))
        return sections

    @staticmethod
    def _cosine_similarity(vec_a, vec_b) -> float:
        """Return the cosine similarity of two flat numeric vectors.

        Raises:
            TypeError: If an element cannot be converted to ``float``.
            ValueError: If an element is not numeric or the lengths differ.
        """
        a = [float(x) for x in vec_a]
        b = [float(x) for x in vec_b]
        if len(a) != len(b):
            raise ValueError(
                f"embedding length mismatch: {len(a)} != {len(b)}"
            )
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(y * y for y in b) ** 0.5
        if norm_a == 0.0 or norm_b == 0.0:
            # A zero vector has no direction; treat it as unrelated.
            return 0.0
        return sum(x * y for x, y in zip(a, b)) / (norm_a * norm_b)

    def get_relevant_standards(self, document_embedding: List[float], threshold: float = 0.7) -> List[Dict]:
        """Find relevant compliance standards for a document.

        Each stored embedding is compared with ``document_embedding`` by cosine
        similarity, and only standards scoring at least ``threshold`` are
        returned, most similar first.

        If a stored embedding cannot be scored (its elements are not numeric or
        its length differs from ``document_embedding``), the standard is still
        returned with ``similarity`` set to ``None`` and placed after the scored
        matches, so an unexpected embedding format does not drop standards.

        Args:
            document_embedding: Embedding vector of the document under review.
            threshold: Minimum cosine similarity for a standard to be included.

        Returns:
            A list of dicts with keys ``standard``, ``filename``, ``content``,
            ``sections`` and ``similarity``.
        """
        relevant_standards = []
        for doc_key, compliance_embedding in self.compliance_embeddings.items():
            try:
                score = self._cosine_similarity(document_embedding, compliance_embedding)
            except (TypeError, ValueError):
                score = None
            if score is not None and score < threshold:
                continue
            doc_data = self.compliance_docs[doc_key]
            relevant_standards.append({
                'standard': doc_key,
                'filename': doc_data['filename'],
                'content': doc_data['content'],
                'sections': doc_data['sections'],
                'similarity': score,
            })
        # Stable sort: best matches first, unscored entries last.
        relevant_standards.sort(
            key=lambda item: float('-inf') if item['similarity'] is None else item['similarity'],
            reverse=True,
        )
        return relevant_standards

    def get_compliance_context(self, max_chars: int = 1000) -> str:
        """Get formatted compliance context for LLM prompts.

        Args:
            max_chars: Maximum number of characters of each document's content
                to include. ``...`` is appended only when content is cut.

        Returns:
            A header line followed by one ``=== filename ===`` block per loaded
            document.
        """
        parts = ["COMPLIANCE STANDARDS:\n\n"]
        for doc_data in self.compliance_docs.values():
            content = doc_data['content']
            excerpt = content[:max_chars]
            if len(content) > max_chars:
                excerpt += "..."
            parts.append(f"=== {doc_data['filename']} ===\n{excerpt}\n\n")
        return "".join(parts)