96 lines
3.6 KiB
Python
96 lines
3.6 KiB
Python
import os
|
|
from docx import Document
|
|
from typing import Dict, List
|
|
from .config import Config
|
|
from .embeddings import EmbeddingGenerator
|
|
|
|
|
|
class ComplianceLoader:
    """Load compliance standard documents and expose them for lookup.

    The loader reads a fixed set of ``.docx`` files, keeps their plain text
    and heuristically extracted sections in ``compliance_docs``, and keeps
    one embedding per document in ``compliance_embeddings``. Both mappings
    are keyed by a normalised form of the file name.
    """

    # File names looked up inside the data folder. Kept as a class attribute
    # so a subclass or an instance can override the set of standards.
    COMPLIANCE_FILES = (
        "Invitation to Tender.docx",
        "Tender Specifications.docx",
        "Bill of Quantities.docx",
        "Scope of Work.docx",
        "Supplier SQualification requirements.docx",
        "form of tender.docx",
        "confidentiality agreement.docx",
        "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
    )

    # A line containing any of these (case-insensitive) starts a new section.
    _SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')

    def __init__(self):
        """Create an empty loader with its own embedding generator."""
        self.embedding_generator = EmbeddingGenerator()
        # doc_key -> {'filename', 'content', 'sections'}
        self.compliance_docs = {}
        # doc_key -> whatever EmbeddingGenerator.generate_embeddings returns
        self.compliance_embeddings = {}

    def load_compliance_standards(self, data_folder: str = "data/"):
        """Load every known compliance document and generate its embedding.

        Files that are missing are reported and skipped. A failure while
        reading or embedding one file is reported and does not stop the
        remaining files from loading.

        Args:
            data_folder: Directory that contains the ``.docx`` files.
        """
        for filename in self.COMPLIANCE_FILES:
            file_path = os.path.join(data_folder, filename)
            if not os.path.exists(file_path):
                # Previously skipped without a trace, which made an
                # incomplete data folder hard to notice.
                print(f"Compliance standard not found, skipping: {file_path}")
                continue

            try:
                # Extract text from the compliance document (paragraphs only).
                doc = Document(file_path)
                text = '\n'.join(para.text for para in doc.paragraphs)

                # splitext removes only the trailing extension, unlike
                # str.replace which would also hit ".docx" inside the name.
                stem = os.path.splitext(filename)[0]
                doc_key = stem.replace(' ', '_').lower()

                self.compliance_docs[doc_key] = {
                    'filename': filename,
                    'content': text,
                    'sections': self._extract_sections(text),
                }

                # Embedding used later for similarity search.
                self.compliance_embeddings[doc_key] = (
                    self.embedding_generator.generate_embeddings(text)
                )

                print(f"Loaded compliance standard: {filename}")
            except Exception as e:
                # Deliberate best-effort: one bad file must not abort the load.
                print(f"Error loading {filename}: {str(e)}")

    def _extract_sections(self, text: str) -> List[str]:
        """Split document text into sections using simple line heuristics.

        A non-blank line starts a new section when it is all upper-case,
        ends with a colon, or contains one of ``_SECTION_KEYWORDS``. Blank
        lines are dropped; every other line joins the current section.

        Args:
            text: Full document text with ``\\n`` line separators.

        Returns:
            The sections in document order, each a newline-joined string.
        """
        sections: List[str] = []
        pending: List[str] = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()
            starts_section = (
                line.isupper()
                or line.endswith(':')
                or any(keyword in lowered for keyword in self._SECTION_KEYWORDS)
            )

            if starts_section:
                if pending:
                    sections.append('\n'.join(pending))
                pending = [line]
            else:
                pending.append(line)

        if pending:
            sections.append('\n'.join(pending))

        return sections

    @staticmethod
    def _as_vectors(embedding) -> List[List[float]]:
        """Coerce an embedding into a list of float vectors.

        Accepts a flat sequence of numbers (one vector) or a sequence of such
        sequences (several vectors). Anything that cannot be read either way,
        including ``None`` and an empty sequence, yields an empty list.
        """
        try:
            flat = [float(value) for value in embedding]
        except (TypeError, ValueError):
            flat = None
        if flat is not None:
            return [flat] if flat else []

        try:
            return [[float(value) for value in row] for row in embedding]
        except (TypeError, ValueError):
            return []

    @staticmethod
    def _similarity_scores(query_vectors: List[List[float]],
                           candidate_vectors: List[List[float]]) -> List[float]:
        """Return the cosine similarity of every comparable vector pair.

        Pairs with different lengths or a zero-length norm are skipped, so
        the result is empty when nothing can be compared.
        """
        scores: List[float] = []
        for query in query_vectors:
            query_norm = sum(value * value for value in query) ** 0.5
            if not query_norm:
                continue
            for candidate in candidate_vectors:
                if len(candidate) != len(query):
                    continue
                candidate_norm = sum(value * value for value in candidate) ** 0.5
                if not candidate_norm:
                    continue
                dot = sum(a * b for a, b in zip(query, candidate))
                scores.append(dot / (query_norm * candidate_norm))
        return scores

    def get_relevant_standards(self, document_embedding: List[float], threshold: float = 0.7) -> List[Dict]:
        """Find the compliance standards similar to a document.

        Each stored embedding is compared with ``document_embedding`` using
        cosine similarity; when either side holds several vectors the best
        pair counts. A standard is kept when its score is at least
        ``threshold``. If no score can be computed for a standard (empty or
        incompatible embeddings) it is kept with ``similarity`` set to
        ``None`` rather than silently dropped.

        Args:
            document_embedding: Embedding of the document being checked.
                NOTE(review): assumed to come from the same generator as the
                stored embeddings - confirm against the caller.
            threshold: Minimum cosine similarity for a standard to be kept.

        Returns:
            One dict per kept standard with the keys ``standard``,
            ``filename``, ``content``, ``sections`` and ``similarity``, in
            the order the standards were loaded.
        """
        query_vectors = self._as_vectors(document_embedding)
        relevant_standards = []

        for doc_key, compliance_embedding in self.compliance_embeddings.items():
            scores = self._similarity_scores(
                query_vectors, self._as_vectors(compliance_embedding)
            )
            similarity = max(scores) if scores else None
            if similarity is not None and similarity < threshold:
                continue

            doc_data = self.compliance_docs[doc_key]
            relevant_standards.append({
                'standard': doc_key,
                'filename': doc_data['filename'],
                'content': doc_data['content'],
                'sections': doc_data['sections'],
                'similarity': similarity,
            })

        return relevant_standards

    def get_compliance_context(self, max_chars: int = 1000) -> str:
        """Build the compliance context text used in LLM prompts.

        Args:
            max_chars: Maximum number of characters of each document's
                content to include. Negative values are treated as zero.

        Returns:
            A header followed by one titled excerpt per loaded document. An
            excerpt ends with ``...`` only when the content was actually cut.
        """
        limit = max(0, max_chars)
        parts = ["COMPLIANCE STANDARDS:\n\n"]

        for doc_data in self.compliance_docs.values():
            content = doc_data['content']
            # Only mark truncation when text was really dropped, so short
            # documents are not presented to the model as incomplete.
            marker = "..." if len(content) > limit else ""
            parts.append(f"=== {doc_data['filename']} ===\n")
            parts.append(f"{content[:limit]}{marker}\n\n")

        return "".join(parts)
|