update functions
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
import os
|
||||
from docx import Document
|
||||
from typing import Dict, List
|
||||
from .config import Config
|
||||
from .embeddings import EmbeddingGenerator
|
||||
|
||||
|
||||
class ComplianceLoader:
    """Load compliance documents (.docx) and expose them for matching and prompting.

    Attributes:
        embedding_generator: ``EmbeddingGenerator`` used to embed document text.
        compliance_docs: Maps a normalized document key to a dict with the
            keys ``'filename'``, ``'content'`` and ``'sections'``.
        compliance_embeddings: Maps the same document key to whatever
            ``EmbeddingGenerator.generate_embeddings`` returned for its text.
    """

    # File names looked up inside the data folder. Kept as a class attribute
    # so a subclass or an instance can override the set without editing the
    # loading logic.
    COMPLIANCE_FILES = (
        "Invitation to Tender.docx",
        "Tender Specifications.docx",
        "Bill of Quantities.docx",
        "Scope of Work.docx",
        "Supplier SQualification requirements.docx",
        "form of tender.docx",
        "confidentiality agreement.docx",
        "Project1-FEED CONTRACTOR-MUL-E000-PR-LST-000.docx",
    )

    # Substrings (matched case-insensitively) that mark a line as the start
    # of a new section in `_extract_sections`.
    _SECTION_KEYWORDS = ('requirement', 'specification', 'must', 'shall')

    def __init__(self):
        self.embedding_generator = EmbeddingGenerator()
        self.compliance_docs = {}
        self.compliance_embeddings = {}

    def load_compliance_standards(self, data_folder: str = "data/"):
        """Load all compliance documents and generate embeddings.

        Each file in ``COMPLIANCE_FILES`` that exists under ``data_folder`` is
        parsed, split into sections, embedded, and stored in
        ``compliance_docs`` / ``compliance_embeddings``. Missing files and
        files that fail to load are reported on stdout and skipped, so one
        bad document does not prevent the others from loading.

        Args:
            data_folder: Directory that contains the compliance .docx files.
        """
        for filename in self.COMPLIANCE_FILES:
            file_path = os.path.join(data_folder, filename)

            if not os.path.exists(file_path):
                # Previously skipped without a trace; say so explicitly.
                print(f"Compliance standard not found, skipping: {file_path}")
                continue

            try:
                # Extract text from compliance document
                doc = Document(file_path)
                text = '\n'.join(para.text for para in doc.paragraphs)

                # Key is the file name without '.docx', lower-cased, with
                # spaces replaced by underscores.
                doc_key = filename.replace('.docx', '').replace(' ', '_').lower()
                self.compliance_docs[doc_key] = {
                    'filename': filename,
                    'content': text,
                    'sections': self._extract_sections(text),
                }

                # Generate embedding for similarity search
                self.compliance_embeddings[doc_key] = (
                    self.embedding_generator.generate_embeddings(text)
                )

                print(f"Loaded compliance standard: {filename}")
            except Exception as e:
                # Deliberately broad: loading is best-effort per file.
                print(f"Error loading {filename}: {str(e)}")

    def _extract_sections(self, text: str) -> List[str]:
        """Split document text into sections using a heading heuristic.

        A non-empty (stripped) line starts a new section when it is all upper
        case, ends with ``':'``, or contains one of ``_SECTION_KEYWORDS``
        (case-insensitive). Other non-empty lines are appended to the current
        section; blank lines are dropped.

        Args:
            text: Full document text with lines separated by ``'\\n'``.

        Returns:
            The sections in document order, each one a ``'\\n'``-joined string.
        """
        sections = []
        current_section = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()
            starts_section = (
                line.isupper()
                or line.endswith(':')
                or any(keyword in lowered for keyword in self._SECTION_KEYWORDS)
            )

            if starts_section:
                # Close the section collected so far before opening a new one.
                if current_section:
                    sections.append('\n'.join(current_section))
                current_section = [line]
            else:
                current_section.append(line)

        if current_section:
            sections.append('\n'.join(current_section))

        return sections

    @staticmethod
    def _cosine_similarity(vec_a, vec_b):
        """Return the cosine similarity of two flat numeric vectors.

        Returns ``None`` when the inputs cannot be compared: they are not
        iterables of numbers, they are empty, their lengths differ, or one of
        them has zero magnitude.
        """
        try:
            a = [float(x) for x in vec_a]
            b = [float(x) for x in vec_b]
        except (TypeError, ValueError):
            return None

        if not a or len(a) != len(b):
            return None

        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(y * y for y in b) ** 0.5
        if norm_a == 0 or norm_b == 0:
            return None

        return sum(x * y for x, y in zip(a, b)) / (norm_a * norm_b)

    def get_relevant_standards(self, document_embedding: List[float], threshold: float = 0.7) -> List[Dict]:
        """Find relevant compliance standards for a document.

        Every stored compliance embedding is compared with
        ``document_embedding`` by cosine similarity, and standards scoring
        below ``threshold`` are left out. A standard whose similarity cannot
        be computed (see `_cosine_similarity`) is still returned, with
        ``'similarity'`` set to ``None``, so that it is never dropped
        silently.

        Args:
            document_embedding: Embedding vector of the document to match.
            threshold: Minimum cosine similarity for a standard to be kept.

        Returns:
            One dict per retained standard, in load order, with the keys
            ``'standard'``, ``'filename'``, ``'content'``, ``'sections'`` and
            ``'similarity'``.
        """
        relevant_standards = []

        for doc_key, compliance_embedding in self.compliance_embeddings.items():
            # Calculate similarity
            similarity = self._cosine_similarity(document_embedding, compliance_embedding)
            if similarity is not None and similarity < threshold:
                continue

            doc_data = self.compliance_docs[doc_key]
            relevant_standards.append({
                'standard': doc_key,
                'filename': doc_data['filename'],
                'content': doc_data['content'],
                'sections': doc_data['sections'],
                'similarity': similarity,
            })

        return relevant_standards

    def get_compliance_context(self) -> str:
        """Get formatted compliance context for LLM prompts.

        Returns:
            A header line followed, for each loaded document, by its file name
            and the first 1000 characters of its content. The ``...`` marker
            is appended after every excerpt, whether or not it was cut.
        """
        parts = ["COMPLIANCE STANDARDS:\n\n"]

        for doc_data in self.compliance_docs.values():
            parts.append(f"=== {doc_data['filename']} ===\n")
            parts.append(f"{doc_data['content'][:1000]}...\n\n")

        return "".join(parts)
|
||||
Reference in New Issue
Block a user