Initial commit
This commit is contained in:
@@ -0,0 +1 @@
|
||||
"""Services for the Mini SpecsComply Pro application."""
|
||||
@@ -0,0 +1,461 @@
|
||||
# Document processing
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional, BinaryIO, Tuple
|
||||
import re
|
||||
from loguru import logger
|
||||
|
||||
from app.core.models import (
|
||||
Document,
|
||||
DocumentMetadata,
|
||||
DocumentStatus,
|
||||
ComplianceReport,
|
||||
ComplianceIssue,
|
||||
ComplianceLevel,
|
||||
DocumentEmbedding
|
||||
)
|
||||
from app.services.embedding import EmbeddingService
|
||||
from app.services.reasoning import ReasoningService
|
||||
from app.services.standards import StandardsService
|
||||
from app.utils.token_counter import count_tokens, truncate_by_tokens
|
||||
|
||||
class DocumentService:
|
||||
"""Service for handling document processing and storage."""
|
||||
|
||||
def __init__(self, embedding_service: EmbeddingService, reasoning_service: ReasoningService, standards_service: Optional[StandardsService] = None):
    """Wire up the collaborating services and create the in-memory stores.

    Args:
        embedding_service: Service used to embed document sections.
        reasoning_service: Service used to produce compliance reports.
        standards_service: Optional standards service; when a falsy value
            is passed a fresh ``StandardsService()`` is created instead.
    """
    if not standards_service:
        standards_service = StandardsService()

    # In-memory storage only (replace with DB in production).
    self.documents = {}
    self.reports = {}

    self.embedding_service = embedding_service
    self.reasoning_service = reasoning_service
    self.standards_service = standards_service
|
||||
|
||||
async def upload_document(self, file: BinaryIO, filename: str) -> Document:
    """
    Validate, register and process a newly uploaded document.

    The document is stored in memory *before* processing starts, so a
    document whose processing fails remains retrievable with status FAILED.

    Args:
        file: The document file
        filename: Name of the uploaded file

    Returns:
        Document object with metadata

    Raises:
        ValueError: If the filename does not have a supported extension.
        Exception: Any processing error is logged and re-raised.
    """
    # Guard clause: refuse unsupported extensions before reading anything.
    if not self._validate_file_type(filename):
        raise ValueError("Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")

    content = await self._read_file_content(file)

    # NOTE: file_size is the length of the decoded text, not a byte count.
    metadata = DocumentMetadata(
        filename=filename,
        file_type=self._get_file_type(filename),
        file_size=len(content),
        upload_timestamp=datetime.now(),
        last_modified=datetime.now(),
    )

    document_id = str(uuid.uuid4())
    document = Document(
        id=document_id,
        metadata=metadata,
        status=DocumentStatus.PENDING,
        version=1,
    )
    self.documents[document_id] = document

    try:
        await self._process_document(document_id, content)
    except Exception as exc:
        logger.error(f"Error processing document {document_id}: {str(exc)}")
        document.status = DocumentStatus.FAILED
        raise

    return document
|
||||
|
||||
async def get_document(self, document_id: str) -> Optional[Document]:
    """
    Look up a stored document.

    Args:
        document_id: The ID of the document to retrieve

    Returns:
        The stored Document, or None when the ID is unknown
    """
    try:
        return self.documents[document_id]
    except KeyError:
        return None
|
||||
|
||||
async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
    """
    Look up a stored compliance report.

    Args:
        report_id: The ID of the report to retrieve

    Returns:
        The stored ComplianceReport, or None when the ID is unknown
    """
    try:
        return self.reports[report_id]
    except KeyError:
        return None
|
||||
|
||||
async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
    """
    Replace the content of an existing document and process it again.

    The document's size, modification time, version and status are updated
    before processing starts. Unlike ``upload_document``, a processing
    failure is logged and NOT re-raised: the document is returned with
    status FAILED instead.

    Args:
        document_id: The ID of the document to resubmit
        file: The updated document file

    Returns:
        Updated Document object

    Raises:
        ValueError: If no document is stored under ``document_id``.
    """
    document = await self.get_document(document_id)
    if not document:
        raise ValueError(f"Document with ID {document_id} not found")

    content = await self._read_file_content(file)

    # Bump the bookkeeping fields for the new revision.
    metadata = document.metadata
    metadata.file_size = len(content)
    metadata.last_modified = datetime.now()
    document.version += 1
    document.status = DocumentStatus.PENDING

    try:
        await self._process_document(document_id, content)
    except Exception as exc:
        logger.error(f"Error processing resubmitted document {document_id}: {str(exc)}")
        document.status = DocumentStatus.FAILED

    return document
|
||||
|
||||
async def process_document(self, document_id: str, content: str) -> ComplianceReport:
    """
    Run the analysis pipeline for a stored document.

    Splits the content into sections, embeds them, determines the
    applicable standards, asks the reasoning service for a compliance
    report and stores that report under its ``report_id``.

    Args:
        document_id: The ID of the document
        content: Document content

    Returns:
        ComplianceReport object

    Raises:
        ValueError: If no document is stored under ``document_id``.
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        document = self.documents.get(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        sections = self._split_into_sections(content)

        # Keep the embedding metadata on the document itself.
        document.embedding = await self.embedding_service.embed_document(document_id, sections)

        standards_service = self.standards_service
        if not standards_service:
            logger.warning(f"No StandardsService available for document {document_id}")
            standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]
        else:
            # Log the standards service instance ID to verify singleton pattern
            logger.info(f"Using StandardsService instance: {id(standards_service)}")
            logger.info(f"Standards count before matching: {len(standards_service.standards)}")

            standard_names = await standards_service.get_standard_names_for_document(content)
            logger.info(f"Identified standards for document {document_id}: {standard_names}")

        report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)
        self.reports[report.report_id] = report
    except Exception as exc:
        logger.error(f"Error in document processing: {str(exc)}")
        raise
    else:
        return report
|
||||
|
||||
async def _read_file_content(self, file: BinaryIO) -> str:
    """
    Read the whole file and return its content as text.

    Bytes are decoded as UTF-8 first and as latin-1 otherwise. A stream
    opened in text mode (``read()`` returns ``str``) is returned as is,
    and an awaitable ``read()`` result is awaited before decoding.

    Args:
        file: The file to read

    Returns:
        File content as string

    Raises:
        ValueError: If ``read()`` yields something that is neither text
            nor a bytes-like object.
    """
    import inspect

    raw = file.read()
    if inspect.isawaitable(raw):
        # Async file wrappers expose read() as a coroutine.
        raw = await raw

    if isinstance(raw, str):
        # Already decoded by a text-mode stream; nothing left to do.
        return raw
    if not isinstance(raw, (bytes, bytearray, memoryview)):
        raise ValueError("Unable to decode file content. Please ensure file is text-based.")

    data = bytes(raw)
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every byte value to a code point, so this fallback
        # cannot fail; the result may be mojibake for other encodings.
        return data.decode('latin-1')
|
||||
|
||||
def _get_file_type(self, filename: str) -> str:
    """
    Return the lower-cased extension of ``filename`` without the dot.

    Args:
        filename: The name of the file

    Returns:
        File type (extension), or an empty string when there is none
    """
    suffix = os.path.splitext(filename)[1]
    return suffix.lower().lstrip('.')
|
||||
|
||||
def _validate_file_type(self, filename: str) -> bool:
    """
    Check whether the filename's extension is one the service accepts.

    The comparison is case-insensitive.

    Args:
        filename: Name of the file to validate

    Returns:
        bool: True if file type is supported, False otherwise
    """
    extension = os.path.splitext(filename)[1].lower()
    return any(
        extension == supported
        for supported in ('.txt', '.md', '.rst', '.doc', '.docx', '.pdf')
    )
|
||||
|
||||
def _split_into_sections(self, content: str) -> Dict[str, str]:
    """
    Split document content into named sections.

    The result always contains the whole text under ``"full_document"``.
    If markdown headings of level 1-3 are present, every heading adds an
    ``"h<level>_<title>"`` entry holding the text up to the next heading;
    repeated headings get a numeric suffix (``_2``, ``_3``, ...) so that
    they no longer overwrite one another. Without headings, paragraphs
    (blank-line separated) longer than 100 characters are added as
    ``"paragraph_<n>"``, numbered by their position among the non-empty
    paragraphs.

    Args:
        content: The document content

    Returns:
        Dictionary mapping section names to section content
    """
    sections = {"full_document": content}

    # Only spaces/tabs may separate the hashes from the title: with \s+ a
    # lone "#" line would swallow the following line as its heading text.
    heading_pattern = re.compile(r'^(#{1,3})[ \t]+(.+)$', re.MULTILINE)
    matches = list(heading_pattern.finditer(content))

    if not matches:
        paragraphs = [p for p in content.split('\n\n') if p.strip()]
        for number, paragraph in enumerate(paragraphs, start=1):
            if len(paragraph) > 100:  # Only include substantial paragraphs
                sections[f"paragraph_{number}"] = paragraph
        return sections

    # Each section ends where the next heading starts (or at end of text).
    section_ends = [m.start() for m in matches[1:]] + [len(content)]
    for match, end_pos in zip(matches, section_ends):
        level = len(match.group(1))
        title = match.group(2).strip()

        base_key = f"h{level}_{title}"
        key = base_key
        duplicate_number = 2
        while key in sections:
            key = f"{base_key}_{duplicate_number}"
            duplicate_number += 1

        sections[key] = content[match.end():end_pos].strip()

    return sections
|
||||
|
||||
async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
    """
    Build a keyword-based mock compliance report for development/testing.

    Only the ``"full_document"`` section is inspected. Three simple checks
    are applied (missing "introduction", fewer than three ``#`` characters,
    "compliance" mentioned without "standard"); the score drops by 0.1 per
    issue found and never goes below 0.0.

    Args:
        document_id: The ID of the document
        sections: Dictionary of document sections

    Returns:
        ComplianceReport object
    """
    found = []

    if "full_document" in sections:
        text = sections["full_document"]
        lowered = text.lower()

        # Check for missing sections (mock check)
        if "introduction" not in lowered:
            found.append(ComplianceIssue(
                section="Document Structure",
                description="Missing introduction section",
                level=ComplianceLevel.MAJOR,
                recommendation="Add an introduction section to provide context for the document",
            ))

        # Check for formatting issues (mock check)
        if text.count('#') < 3:
            found.append(ComplianceIssue(
                section="Formatting",
                description="Insufficient section headings",
                level=ComplianceLevel.MINOR,
                recommendation="Use markdown headings to better structure the document",
            ))

        # Check for technical compliance (mock check)
        if "compliance" in lowered and "standard" not in lowered:
            found.append(ComplianceIssue(
                section="Technical Content",
                description="Mentions compliance but doesn't reference specific standards",
                level=ComplianceLevel.CRITICAL,
                recommendation="Specify which standards or regulations the document complies with",
            ))

    total = len(found)
    score = max(0.0, 1.0 - (total * 0.1)) if found else 1.0

    if not found:
        summary = "The document meets all compliance requirements. No issues found."
    else:
        n_critical = len([issue for issue in found if issue.level == ComplianceLevel.CRITICAL])
        n_major = len([issue for issue in found if issue.level == ComplianceLevel.MAJOR])
        n_minor = len([issue for issue in found if issue.level == ComplianceLevel.MINOR])

        pieces = [f"The document has {total} compliance issues: "]
        if n_critical:
            pieces.append(f"{n_critical} critical, ")
        if n_major:
            pieces.append(f"{n_major} major, ")
        head = "".join(pieces)

        if n_minor:
            head = head + f"{n_minor} minor."
        else:
            # No minor issues: drop the dangling ", " and close the sentence.
            head = head.rstrip(", ") + "."

        summary = head + " See detailed report for recommendations."

    return ComplianceReport(
        document_id=document_id,
        compliance_score=score,
        summary=summary,
        issues=found,
    )
|
||||
|
||||
async def _process_document(self, document_id: str, content: str) -> None:
    """
    Drive a document through processing and keep its status in sync.

    The status moves to PROCESSING, then to COMPLETED once a report has
    been generated and its ID appended to ``document.reports``. On any
    error the status is set to FAILED and the error is re-raised.

    Args:
        document_id: The ID of the document to process
        content: The document content

    Raises:
        ValueError: If no document is stored under ``document_id``.
    """
    document = self.documents.get(document_id)
    if not document:
        raise ValueError(f"Document {document_id} not found")

    try:
        document.status = DocumentStatus.PROCESSING

        report = await self.process_document(document_id, content)
        document.reports.append(report.report_id)

        document.status = DocumentStatus.COMPLETED
    except Exception:
        document.status = DocumentStatus.FAILED
        raise
|
||||
|
||||
async def get_document_stats(self, document_id: str) -> Dict[str, object]:
    """
    Get statistics for a document.

    Score and issue counts come from the most recently appended report
    (the last entry of ``document.reports``). When the document has no
    report, or that report can no longer be found, the score is None and
    the issue counts are 0.

    Args:
        document_id: The ID of the document

    Returns:
        Dictionary containing document statistics

    Raises:
        ValueError: If no document is stored under ``document_id``.
    """
    document = await self.get_document(document_id)
    if not document:
        raise ValueError(f"Document {document_id} not found")

    latest_report = None
    if document.reports:
        latest_report = await self.get_report(document.reports[-1])

    metadata = document.metadata
    stats = {
        "document_id": document_id,
        "version": document.version,
        "status": document.status,
        "file_size": metadata.file_size,
        "upload_date": metadata.upload_timestamp,
        "last_modified": metadata.last_modified,
        "num_reports": len(document.reports),
        "latest_compliance_score": latest_report.compliance_score if latest_report else None,
        "critical_issues": latest_report.critical_issues_count if latest_report else 0,
        "major_issues": latest_report.major_issues_count if latest_report else 0,
        "minor_issues": latest_report.minor_issues_count if latest_report else 0,
    }
    return stats
|
||||
|
||||
async def cleanup_old_documents(self, days: int = 30) -> List[str]:
    """
    Drop documents uploaded more than ``days`` days ago, with their reports.

    Args:
        days: Number of days after which documents should be removed

    Returns:
        List of removed document IDs
    """
    threshold = datetime.now() - timedelta(days=days)
    removed_ids = []

    # Iterate over a snapshot because entries are removed along the way.
    for doc_id, document in tuple(self.documents.items()):
        if not (document.metadata.upload_timestamp < threshold):
            continue

        for report_id in document.reports:
            self.reports.pop(report_id, None)

        self.documents.pop(doc_id)
        removed_ids.append(doc_id)

    return removed_ids
|
||||
|
||||
|
||||
@@ -0,0 +1,254 @@
|
||||
import cohere
|
||||
from typing import List, Dict, Any, Optional
|
||||
import uuid
|
||||
from pinecone import Pinecone
|
||||
import weaviate
|
||||
from loguru import logger
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.models import DocumentEmbedding
|
||||
|
||||
class EmbeddingService:
|
||||
"""Service for document embedding and vector database operations."""
|
||||
|
||||
def __init__(self):
    """Create the Cohere client, the vector DB client and pick the embedding model."""
    cohere_client = cohere.Client(settings.COHERE_API_KEY)
    self.cohere_client = cohere_client

    # Which backend is used is decided by the application settings.
    vector_db_client = self._init_vector_db()
    self.vector_db_client = vector_db_client

    self.embedding_model = settings.EMBEDDING_MODEL
|
||||
|
||||
def _init_vector_db(self) -> Any:
    """
    Build the vector database client selected by the settings.

    * ``VECTOR_DB == "pinecone"`` with an API key: returns the Pinecone
      index named ``PINECONE_INDEX_NAME``, creating it first if needed.
      ``create_index`` requires a ``spec``; a serverless spec is built from
      ``settings.PINECONE_CLOUD`` / ``settings.PINECONE_REGION`` when those
      settings exist, otherwise ``"aws"`` / ``"us-east-1"`` are used.
    * ``VECTOR_DB == "weaviate"`` with a URL: returns a Weaviate client and
      makes sure the ``Document`` class exists in its schema.
    * Anything else: logs a warning and returns a ``MockVectorDB``.

    Returns:
        A Pinecone index, a Weaviate client or a MockVectorDB instance.
    """
    if settings.VECTOR_DB == "pinecone" and settings.PINECONE_API_KEY:
        # Local import: ServerlessSpec comes from the already-used pinecone package.
        from pinecone import ServerlessSpec

        pc = Pinecone(api_key=settings.PINECONE_API_KEY)

        # Check if index exists, if not create it
        if settings.PINECONE_INDEX_NAME not in pc.list_indexes().names():
            pc.create_index(
                name=settings.PINECONE_INDEX_NAME,
                dimension=1024,  # Cohere embed-english-v3.0 dimension
                metric="cosine",
                spec=ServerlessSpec(
                    cloud=getattr(settings, "PINECONE_CLOUD", "aws"),
                    region=getattr(settings, "PINECONE_REGION", "us-east-1"),
                ),
            )

        return pc.Index(settings.PINECONE_INDEX_NAME)

    if settings.VECTOR_DB == "weaviate" and settings.WEAVIATE_URL:
        auth_config = weaviate.auth.AuthApiKey(api_key=settings.WEAVIATE_API_KEY) if settings.WEAVIATE_API_KEY else None
        client = weaviate.Client(
            url=settings.WEAVIATE_URL,
            auth_client_secret=auth_config
        )

        # schema.contains() only returns a bool, so read the schema itself
        # and look for our class by name (other classes may already exist).
        existing_classes = {
            class_def.get("class")
            for class_def in client.schema.get().get("classes", [])
        }
        if "Document" not in existing_classes:
            class_obj = {
                "class": "Document",
                "vectorizer": "none",  # We'll provide our own vectors
                "properties": [
                    {
                        "name": "content",
                        "dataType": ["text"]
                    },
                    {
                        "name": "document_id",
                        "dataType": ["string"]
                    },
                    {
                        "name": "section_name",
                        "dataType": ["string"]
                    }
                ]
            }
            client.schema.create_class(class_obj)
        return client

    logger.warning("No valid vector database configuration found. Using mock implementation.")
    return MockVectorDB()
|
||||
|
||||
async def embed_document(self, document_id: str, sections: Dict[str, str]) -> DocumentEmbedding:
    """
    Embed document sections and store in vector database.

    Each section is embedded with Cohere and written to the configured
    backend. The backend is chosen from the client that was actually
    created: if ``_init_vector_db`` fell back to ``MockVectorDB`` the
    mock's ``upsert`` is used regardless of ``settings.VECTOR_DB``.

    For Weaviate the object ID must be a valid UUID, so a UUID derived
    from the readable section ID is used and recorded instead.

    Args:
        document_id: Unique identifier for the document
        sections: Dictionary mapping section names to section content

    Returns:
        DocumentEmbedding object with embedding metadata

    Raises:
        Exception: Any embedding/storage error is logged and re-raised.
    """
    section_ids = {}

    client = self.vector_db_client
    using_mock = isinstance(client, MockVectorDB)
    use_upsert = using_mock or settings.VECTOR_DB == "pinecone"
    use_weaviate = not using_mock and settings.VECTOR_DB == "weaviate"

    for section_name, content in sections.items():
        try:
            embedding_response = self.cohere_client.embed(
                texts=[content],
                model=self.embedding_model,
                input_type="search_document"
            )
            embedding_vector = embedding_response.embeddings[0]

            # Readable, unique ID for this section
            section_id = f"{document_id}_{section_name}_{str(uuid.uuid4())[:8]}"

            if use_upsert:
                client.upsert(
                    vectors=[{
                        "id": section_id,
                        "values": embedding_vector,
                        "metadata": {
                            "document_id": document_id,
                            "section_name": section_name,
                            "content": content[:1000]  # Store truncated content for context
                        }
                    }],
                    namespace=document_id
                )
            elif use_weaviate:
                # Weaviate rejects IDs that are not UUIDs; derive one
                # deterministically from the readable section ID.
                section_id = str(uuid.uuid5(uuid.NAMESPACE_URL, section_id))
                client.data_object.create(
                    class_name="Document",
                    data_object={
                        "content": content,
                        "document_id": document_id,
                        "section_name": section_name
                    },
                    uuid=section_id,
                    vector=embedding_vector
                )

            section_ids[section_name] = section_id
            logger.info(f"Successfully embedded section '{section_name}' for document {document_id}")

        except Exception as e:
            logger.error(f"Error embedding section '{section_name}': {str(e)}")
            raise

    return DocumentEmbedding(
        embedding_id=str(uuid.uuid4()),
        embedding_model=self.embedding_model,
        vector_db=settings.VECTOR_DB,
        sections=section_ids
    )
|
||||
|
||||
async def retrieve_similar_sections(self, query: str, document_id: Optional[str] = None, top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Retrieve similar document sections for a query.

    The query is embedded with Cohere and looked up in the backend that
    was actually created (a ``MockVectorDB`` fallback is queried through
    its Pinecone-like interface). Note that ``score`` is the backend's
    match score for Pinecone/mock results but the vector *distance* for
    Weaviate results.

    Args:
        query: The query text to find similar sections for
        document_id: Optional document ID to restrict search
        top_k: Number of results to return

    Returns:
        List of similar sections with metadata
    """
    query_embedding = self.cohere_client.embed(
        texts=[query],
        model=self.embedding_model,
        input_type="search_query"
    ).embeddings[0]

    client = self.vector_db_client
    using_mock = isinstance(client, MockVectorDB)

    if using_mock or settings.VECTOR_DB == "pinecone":
        results = client.query(
            vector=query_embedding,
            top_k=top_k,
            namespace=document_id if document_id else None,
            include_metadata=True
        )
        # Tolerate a result without `.matches` (e.g. a bare empty list).
        matches = getattr(results, "matches", None) or []
        return [
            {
                "section_id": match.id,
                "document_id": match.metadata["document_id"],
                "section_name": match.metadata["section_name"],
                "content": match.metadata.get("content", ""),
                "score": match.score
            }
            for match in matches
        ]

    if settings.VECTOR_DB == "weaviate":
        query_builder = client.query.get(
            "Document", ["content", "document_id", "section_name"]
        ).with_near_vector({
            "vector": query_embedding
        }).with_limit(top_k).with_additional(["id", "distance"])
        # Without with_additional() the `_additional` block read below
        # is never returned and id/score would always be None.

        if document_id:
            query_builder = query_builder.with_where({
                "path": ["document_id"],
                "operator": "Equal",
                "valueString": document_id
            })

        results = query_builder.do()

        similar_sections = []
        for item in results.get("data", {}).get("Get", {}).get("Document", []):
            additional = item.get("_additional", {})
            similar_sections.append({
                "section_id": additional.get("id"),
                "document_id": item.get("document_id"),
                "section_name": item.get("section_name"),
                "content": item.get("content", ""),
                "score": additional.get("distance")
            })
        return similar_sections

    # No usable backend configured
    return []
|
||||
|
||||
|
||||
class MockVectorDB:
    """Mock vector database for development without actual vector DB.

    Only the metadata of upserted vectors is kept, grouped by namespace;
    vector values are discarded and queries perform no similarity search.
    """

    def __init__(self):
        # namespace -> {vector id -> metadata}
        self.vectors = {}
        logger.warning("Using mock vector database. Not suitable for production.")

    def upsert(self, vectors, namespace=None):
        """Store the metadata of each vector under ``namespace``.

        Args:
            vectors: Iterable of dicts with an ``id`` and optional ``metadata``.
            namespace: Target namespace; ``"default"`` when falsy.
        """
        bucket = self.vectors.setdefault(namespace or "default", {})
        for vector in vectors:
            # Metadata is optional; fall back to an empty dict.
            bucket[vector['id']] = vector.get('metadata', {})

    def query(self, vector, top_k=5, namespace=None, include_metadata=True):
        """Return up to ``top_k`` stored entries with a fixed score of 0.8.

        The query vector is ignored. The result always exposes ``.matches``
        (empty for an unknown namespace) so callers can treat it like a
        real query response.

        Args:
            vector: Query vector (unused).
            top_k: Maximum number of matches to return.
            namespace: Namespace to read; ``"default"`` when falsy.
            include_metadata: Accepted for interface compatibility (unused).

        Returns:
            An object with a ``matches`` list of ``(id, score, metadata)`` items.
        """
        from collections import namedtuple

        Match = namedtuple('Match', ['id', 'score', 'metadata'])
        Results = namedtuple('Results', ['matches'])

        stored = self.vectors.get(namespace or "default", {})
        matches = [
            Match(id=vector_id, score=0.8, metadata=metadata)
            for vector_id, metadata in list(stored.items())[:top_k]
        ]
        return Results(matches=matches)
|
||||
@@ -0,0 +1,136 @@
|
||||
# Reranking services
|
||||
import cohere
|
||||
from typing import List, Dict, Any
|
||||
from loguru import logger
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.models import ComplianceIssue, ComplianceReport, ComplianceLevel
|
||||
|
||||
class RankingService:
|
||||
"""Service for ranking and prioritizing compliance issues using Cohere Reranker."""
|
||||
|
||||
def __init__(self):
    """Create the Cohere client and remember which reranker model to use."""
    api_key = settings.COHERE_API_KEY
    self.cohere_client = cohere.Client(api_key)
    self.reranker_model = settings.RERANKER_MODEL
|
||||
|
||||
async def prioritize_issues(self, report: ComplianceReport, max_issues: int = 10) -> ComplianceReport:
    """
    Order a report's issues by severity, then by reranker relevance.

    Issues are sorted by ``level_weight * 100 + relevance_score`` in
    descending order and cut to ``max_issues``. Reports with at most one
    issue are returned untouched, and so is the original report whenever
    ranking fails.

    Args:
        report: The compliance report with issues to prioritize
        max_issues: Maximum number of issues to include in the final report

    Returns:
        Updated compliance report with prioritized issues
    """
    # No need to rank if there's only 0 or 1 issues
    if not report.issues or len(report.issues) <= 1:
        return report

    try:
        issues = report.issues

        issue_texts = []
        for issue in issues:
            issue_texts.append(
                f"Section: {issue.section}. "
                f"Level: {issue.level.value}. "
                f"Description: {issue.description}. "
                f"Recommendation: {issue.recommendation}"
            )

        # Query object representing what we're looking for
        query = "critical compliance issues that require immediate attention"
        reranked = await self._rerank_issues(query, issue_texts)

        level_weights = {
            ComplianceLevel.CRITICAL: 4,
            ComplianceLevel.MAJOR: 3,
            ComplianceLevel.MINOR: 2,
            ComplianceLevel.INFO: 1,
        }

        def relevance_for(position):
            # First reranker entry pointing at this issue, else 0.0.
            return next(
                (entry["relevance_score"] for entry in reranked if entry["index"] == position),
                0.0,
            )

        # Weighting the level by 100 keeps it the primary sort key.
        scored = [
            ((level_weights.get(issue.level, 0) * 100) + relevance_for(position), issue)
            for position, issue in enumerate(issues)
        ]
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        top_issues = [issue for _, issue in ranked[:max_issues]]

        return ComplianceReport(
            report_id=report.report_id,
            document_id=report.document_id,
            timestamp=report.timestamp,
            compliance_score=report.compliance_score,
            summary=report.summary,
            issues=top_issues,
        )

    except Exception as exc:
        logger.error(f"Error prioritizing issues: {str(exc)}")
        # If ranking fails, return the original report
        return report
|
||||
|
||||
async def _rerank_issues(self, query: str, issue_texts: List[str]) -> List[Dict[str, Any]]:
    """
    Rerank issues using Cohere Reranker.

    The API call is retried (up to 3 attempts with exponential backoff)
    by ``_call_reranker``. Only when all attempts fail does this method
    fall back to a basic ranking that keeps the original order. It never
    raises, which is why the retry has to live on the inner call: a
    ``@retry`` on this method could never fire.

    Args:
        query: The search query to compare issues against
        issue_texts: List of issue descriptions to rank

    Returns:
        List of dictionaries with the original ``index`` of each issue
        and its ``relevance_score``
    """
    try:
        response = await self._call_reranker(query, issue_texts)
    except Exception as e:
        logger.error(f"Error calling Cohere Reranker: {str(e)}")

        # Return basic ranking if reranking fails
        return [{"index": i, "relevance_score": 1.0 - (i * 0.1)}
                for i in range(len(issue_texts))]

    return [
        {
            "index": result.index,  # Original index in the issues list
            "relevance_score": result.relevance_score
        }
        for result in response.results
    ]

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), reraise=True)
async def _call_reranker(self, query: str, issue_texts: List[str]) -> Any:
    """Call the Cohere rerank endpoint; retried on any exception, last error re-raised."""
    return self.cohere_client.rerank(
        model=self.reranker_model,
        query=query,
        documents=issue_texts,
        top_n=len(issue_texts)
    )
|
||||
@@ -0,0 +1,168 @@
|
||||
# Reasoning with LLMs
|
||||
# Reasoning with LLMs using GROQ
|
||||
import json
|
||||
from typing import Dict, List
|
||||
from loguru import logger
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.models import ComplianceIssue, ComplianceLevel, ComplianceReport
|
||||
from app.utils.token_counter import count_tokens, truncate_by_tokens
|
||||
from groq import Groq # Assuming groq Python SDK is installed
|
||||
|
||||
class ReasoningService:
|
||||
"""Service for performing deep reasoning on documents using Groq."""
|
||||
|
||||
def __init__(self):
    """Create the Groq client and remember which reasoning model to use."""
    api_key = settings.GROQ_API_KEY
    self.client = Groq(api_key=api_key)
    self.model = settings.REASONING_MODEL  # e.g., "mixtral-8x7b-32768"
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
async def analyze_document(self, document_id: str, sections: Dict[str, str], standards: List[str]) -> ComplianceReport:
    """
    Analyze a document against the given standards and build a report.

    All sections are rendered as ``# <name>`` blocks, joined, truncated to
    30000 tokens if necessary and sent to Groq; the answer is parsed into
    a ComplianceReport. The whole method is retried up to 3 times.

    NOTE(review): every entry of ``sections`` is rendered, so a
    ``"full_document"`` entry alongside per-heading entries puts the same
    text into the prompt twice - confirm this is intended.

    Args:
        document_id: The ID of the document being analyzed
        sections: Dictionary mapping section names to section content
        standards: Names of the standards to check against

    Returns:
        ComplianceReport object

    Raises:
        Exception: Query/parse errors are logged and re-raised.
    """
    rendered_sections = []
    for name, content in sections.items():
        rendered_sections.append(f"# {name}\n{content}")
    document_content = "\n\n".join(rendered_sections)

    # Use token-based truncation instead of character-based
    max_tokens = 30000  # Adjust based on model context window
    token_count = count_tokens(document_content)
    logger.info(f"Document {document_id} has {token_count} tokens before truncation")

    if token_count > max_tokens:
        document_content = truncate_by_tokens(document_content, max_tokens)
        logger.info(f"Document {document_id} truncated to {max_tokens} tokens")

    prompt = self._create_analysis_prompt(document_content, standards)

    try:
        response = await self._query_groq(prompt)
        return self._parse_compliance_response(document_id, response, standards)
    except Exception as exc:
        logger.error(f"Error analyzing document with Groq: {str(exc)}")
        raise
|
||||
|
||||
def _create_analysis_prompt(self, document_content: str, standards: List[str]) -> str:
|
||||
standards_text = "\n".join([f"- {standard}" for standard in standards])
|
||||
return f"""<document>
|
||||
{document_content}
|
||||
</document>
|
||||
|
||||
<standards>
|
||||
{standards_text}
|
||||
</standards>
|
||||
|
||||
You are an expert in document compliance and technical specifications. Please analyze the document above against the listed standards.
|
||||
|
||||
Your job is to identify compliance issues and provide detailed reasoning and recommendations. Focus on:
|
||||
1. Technical accuracy and completeness
|
||||
2. Compliance with the specified standards
|
||||
3. Document structure and organization
|
||||
4. Clarity and specificity of language
|
||||
5. Consistency and coherence
|
||||
|
||||
For each compliance issue you find, please provide:
|
||||
- The section where the issue appears
|
||||
- A detailed description of the issue
|
||||
- The severity level (critical, major, minor, or info)
|
||||
- A thorough explanation of why this is an issue and how it impacts compliance
|
||||
- Specific, actionable recommendations to fix the issue
|
||||
- References to specific standards or best practices that apply
|
||||
|
||||
Respond in the following JSON format:
|
||||
{{
|
||||
"summary": "Comprehensive overall assessment of the document",
|
||||
"compliance_score": 0.0 to 1.0,
|
||||
"issues": [
|
||||
{{
|
||||
"section": "Section name",
|
||||
"description": "Detailed issue description",
|
||||
"level": "critical/major/minor/info",
|
||||
"reasoning": "Thorough explanation of why this is an issue",
|
||||
"standard_references": ["Specific standards or requirements that are violated"],
|
||||
"recommendation": "Detailed, actionable recommendation to fix the issue"
|
||||
}}
|
||||
]
|
||||
}}"""
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
|
||||
async def _query_groq(self, prompt: str) -> str:
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{"role": "system", "content": "You are an AI assistant specialized in document compliance analysis."},
|
||||
{"role": "user", "content": prompt}
|
||||
],
|
||||
max_tokens=4000,
|
||||
temperature=0.2,
|
||||
top_p=1.0
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
except Exception as e:
|
||||
logger.error(f"Error querying Groq: {str(e)}")
|
||||
raise
|
||||
|
||||
def _parse_compliance_response(self, document_id: str, response: str, standards: List[str]) -> ComplianceReport:
|
||||
try:
|
||||
json_start = response.find('{')
|
||||
json_end = response.rfind('}') + 1
|
||||
|
||||
if json_start == -1 or json_end == 0:
|
||||
raise ValueError("Could not find JSON in response")
|
||||
|
||||
json_response = response[json_start:json_end]
|
||||
data = json.loads(json_response)
|
||||
|
||||
summary = data.get("summary", "No summary provided")
|
||||
compliance_score = float(data.get("compliance_score", 0.5))
|
||||
issues = []
|
||||
|
||||
for issue_data in data.get("issues", []):
|
||||
level_str = issue_data.get("level", "minor").lower()
|
||||
if level_str == "critical":
|
||||
level = ComplianceLevel.CRITICAL
|
||||
elif level_str == "major":
|
||||
level = ComplianceLevel.MAJOR
|
||||
elif level_str == "info":
|
||||
level = ComplianceLevel.INFO
|
||||
else:
|
||||
level = ComplianceLevel.MINOR
|
||||
|
||||
issues.append(ComplianceIssue(
|
||||
section=issue_data.get("section", "Unknown"),
|
||||
description=issue_data.get("description", "No description provided"),
|
||||
level=level,
|
||||
reasoning=issue_data.get("reasoning", "No detailed reasoning provided"),
|
||||
standard_references=issue_data.get("standard_references", []),
|
||||
recommendation=issue_data.get("recommendation", "No recommendation provided")
|
||||
))
|
||||
|
||||
return ComplianceReport(
|
||||
document_id=document_id,
|
||||
compliance_score=compliance_score,
|
||||
summary=summary,
|
||||
issues=issues,
|
||||
applied_standards=standards
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
logger.error("Failed to parse JSON from response")
|
||||
return ComplianceReport(
|
||||
document_id=document_id,
|
||||
compliance_score=0.0,
|
||||
summary="Failed to analyze document due to parsing error.",
|
||||
issues=[
|
||||
ComplianceIssue(
|
||||
section="System",
|
||||
description="Failed to parse compliance analysis results.",
|
||||
level=ComplianceLevel.CRITICAL,
|
||||
reasoning="The system encountered an error while parsing the compliance analysis results.",
|
||||
standard_references=[],
|
||||
recommendation="Please try resubmitting the document or contact support."
|
||||
)
|
||||
],
|
||||
applied_standards=[]
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing compliance response: {str(e)}")
|
||||
raise
|
||||
@@ -0,0 +1,250 @@
|
||||
# Standards management
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, List, Optional, BinaryIO, Tuple
|
||||
import uuid
|
||||
from loguru import logger
|
||||
|
||||
from app.core.models import Standard, Requirement, RequirementSeverity
|
||||
from app.utils.helpers import load_standards_from_file
|
||||
from app.services.standards_matcher import StandardsMatcher
|
||||
|
||||
# Singleton instance to ensure all parts of the application use the same standards
_standards_service_instance = None


class StandardsService:
    """Service for managing compliance standards.

    The class is a process-wide singleton: every ``StandardsService()`` call
    returns the same object, whose ``standards`` dict (id -> Standard) is an
    in-memory store filled from the default standards directory and uploads.
    """

    # Names returned when no standard is available or none is relevant.
    DEFAULT_STANDARDS = ("ISO-9001", "IEEE-829", "RFC-2119")

    def __new__(cls):
        """Implement singleton pattern to ensure all parts of the app use the same standards."""
        global _standards_service_instance
        if _standards_service_instance is None:
            instance = super().__new__(cls)
            instance.standards = {}  # In-memory storage for standards
            instance.matcher = StandardsMatcher()  # Advanced standards matching logic
            instance._load_default_standards()
            # Publish only after initialization succeeded, so a failure above
            # cannot leave a half-built instance as the shared singleton.
            _standards_service_instance = instance
        return _standards_service_instance

    def __init__(self):
        """Initialize the standards service."""
        # Initialization is done in __new__ for the singleton pattern

    def _load_default_standards(self):
        """Load default standards from the standards directory.

        Every ``*.json`` file that has a top-level ``"standards"`` list is
        loaded; a file that fails to load is logged and skipped. Files are
        processed in sorted order so that overrides of a repeated standard id
        are deterministic.
        """
        standards_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "standard")

        if not os.path.exists(standards_dir):
            logger.warning(f"Standards directory not found: {standards_dir}")
            return

        for filename in sorted(os.listdir(standards_dir)):
            if not filename.endswith(".json"):
                continue
            try:
                file_path = os.path.join(standards_dir, filename)
                standards_data = load_standards_from_file(file_path)

                if "standards" not in standards_data:
                    logger.warning(f"No 'standards' entry in {filename}; skipping")
                    continue

                for std_data in standards_data["standards"]:
                    standard = self._create_standard_from_data(std_data)
                    self.standards[standard.id] = standard
                    logger.info(f"Loaded standard: {standard.name} ({standard.id})")
            except Exception as e:
                logger.error(f"Error loading standard from {filename}: {str(e)}")

    def _create_standard_from_data(self, data: Dict) -> Standard:
        """
        Create a Standard object from dictionary data.

        Missing ids are replaced with a random UUID, a missing name with
        "Unnamed Standard", and unknown or missing severities with MINOR.

        Args:
            data: Dictionary containing standard data

        Returns:
            Standard object
        """
        severity_by_name = {
            "critical": RequirementSeverity.CRITICAL,
            "major": RequirementSeverity.MAJOR,
            "minor": RequirementSeverity.MINOR,
            "info": RequirementSeverity.INFO,
        }

        requirements = []
        # `or []` also covers an explicit `"requirements": null`.
        for req_data in data.get("requirements") or []:
            # Map severity string to RequirementSeverity enum
            severity_str = str(req_data.get("severity", "minor")).strip().lower()
            severity = severity_by_name.get(severity_str, RequirementSeverity.MINOR)

            requirements.append(Requirement(
                id=req_data.get("id", str(uuid.uuid4())),
                description=req_data.get("description", ""),
                severity=severity,
                details=req_data.get("details", None),
            ))

        return Standard(
            id=data.get("id", str(uuid.uuid4())),
            name=data.get("name", "Unnamed Standard"),
            description=data.get("description", ""),
            requirements=requirements,
        )

    async def get_all_standards(self) -> List[Standard]:
        """
        Get all available standards.

        Returns:
            List of Standard objects
        """
        return list(self.standards.values())

    async def get_standard(self, standard_id: str) -> Optional[Standard]:
        """
        Get a standard by ID.

        Args:
            standard_id: ID of the standard to retrieve

        Returns:
            Standard object if found, None otherwise
        """
        return self.standards.get(standard_id)

    async def get_standard_by_name(self, name: str) -> Optional[Standard]:
        """
        Get a standard by name (case-insensitive).

        Args:
            name: Name of the standard to retrieve

        Returns:
            Standard object if found, None otherwise
        """
        name_lower = name.lower()
        return next(
            (standard for standard in self.standards.values() if standard.name.lower() == name_lower),
            None,
        )

    async def upload_standard(self, file: BinaryIO, filename: str) -> Optional[Standard]:
        """
        Upload and process a standard definition file.

        The file may hold either one standard object or an object with a
        ``"standards"`` list; every standard found is stored by id.

        Args:
            file: The standard definition file (JSON)
            filename: Name of the uploaded file

        Returns:
            The first standard stored, or None when the ``"standards"`` list
            is empty.

        Raises:
            ValueError: If the content is not valid JSON or is not a JSON
                object.
        """
        try:
            # Read file content
            content = await self._read_file_content(file)

            # Parse JSON
            data = json.loads(content)
            if not isinstance(data, dict):
                raise ValueError("Standard definition file must contain a JSON object")

            if isinstance(data.get("standards"), list):
                # Multiple standards in file
                entries = data["standards"]
            else:
                # Single standard in file
                entries = [data]

            uploaded = []
            for std_data in entries:
                standard = self._create_standard_from_data(std_data)
                self.standards[standard.id] = standard
                uploaded.append(standard)
                logger.info(f"Uploaded standard: {standard.name} (ID: {standard.id}) with {len(standard.requirements)} requirements")

            # Log the current standards count after upload
            logger.info(f"Total standards in system after upload: {len(self.standards)}")

            # Return the first standard for simplicity
            return uploaded[0] if uploaded else None

        except json.JSONDecodeError as err:
            raise ValueError("Invalid JSON format in standard definition file") from err
        except Exception as e:
            logger.error(f"Error processing standard file: {str(e)}")
            raise

    async def _read_file_content(self, file: BinaryIO) -> str:
        """
        Read and decode file content.

        Bytes are decoded as UTF-8 (a leading byte-order mark is dropped so
        the text can be fed to ``json.loads``), falling back to latin-1, which
        maps every byte and therefore cannot fail. Text returned by a
        text-mode file object is passed through unchanged.

        Args:
            file: The file to read

        Returns:
            File content as string
        """
        file_content = file.read()

        if isinstance(file_content, str):
            return file_content

        try:
            return file_content.decode('utf-8-sig')
        except UnicodeDecodeError:
            return file_content.decode('latin-1')

    async def get_standard_names_for_document(self, document_content: str) -> List[str]:
        """
        Identify which standards might be relevant for a document based on content.
        Uses the standards matcher to score every stored standard.

        Args:
            document_content: The document content

        Returns:
            Names of up to five relevant standards, most relevant first, or
            the default standard names when nothing is stored or nothing
            matches.
        """
        # Default standards to use if no matches are found (fresh list so
        # callers may mutate the result freely).
        default_standards = list(self.DEFAULT_STANDARDS)

        # Log available standards for debugging
        logger.info(f"Available standards in the system: {len(self.standards)}")
        for std_id, std in self.standards.items():
            logger.info(f"  - {std.name} (ID: {std_id})")

        # If no standards are available, return defaults
        if not self.standards:
            logger.warning("No standards available in the system. Using default standards.")
            return default_standards

        # Use the standards matcher to find relevant standards
        standard_scores = self.matcher.find_relevant_standards(
            document_content=document_content,
            standards=list(self.standards.values()),
            threshold=0.1,  # Minimum relevance threshold
            max_standards=5,  # Maximum number of standards to return
        )

        # If no relevant standards found, use defaults
        if not standard_scores:
            logger.info("No relevant standards found based on document content.")
            logger.info(f"Using default standards: {default_standards}")
            return default_standards

        # Log the matching results
        logger.info(f"Found {len(standard_scores)} relevant standards:")
        for name, score in standard_scores:
            logger.info(f"  - {name}: relevance score {score:.2f}")

        # Extract standard names from the results
        return [name for name, _score in standard_scores]
|
||||
@@ -0,0 +1,304 @@
|
||||
# Standards matching logic
|
||||
import re
|
||||
from typing import Dict, List, Set, Tuple, Optional
|
||||
from loguru import logger
|
||||
|
||||
from app.core.models import Standard, Requirement
|
||||
|
||||
|
||||
class StandardsMatcher:
    """
    Matching logic to identify relevant standards for documents.

    A standard's relevance is a capped sum of five heuristics: name mentions,
    keyword overlap, mentions inside important sections or section titles,
    overlap with key terms extracted from the document, and overlap with the
    wording of the standard's requirements.
    """

    # Section names whose content is checked for a mention of the standard.
    _IMPORTANT_SECTIONS = ("introduction", "scope", "purpose", "references",
                           "standards", "compliance", "requirements")

    # Section names searched for as plain line prefixes in the document.
    _COMMON_SECTIONS = ("introduction", "scope", "purpose", "references", "definitions",
                        "requirements", "compliance", "standards", "conclusion", "summary",
                        "appendix", "annex")

    def __init__(self):
        """Initialize the standards matcher."""
        # Common stopwords to filter out when extracting keywords
        self.stopwords = {
            "the", "a", "an", "and", "or", "in", "on", "at", "to", "for", "with",
            "by", "of", "is", "are", "was", "were", "be", "been", "being", "have",
            "has", "had", "do", "does", "did", "but", "if", "then", "else", "when",
            "where", "why", "how", "all", "any", "both", "each", "few", "more",
            "most", "other", "some", "such", "no", "nor", "not", "only", "own",
            "same", "so", "than", "too", "very", "can", "will", "just", "should",
            "now", "this", "that", "these", "those"
        }

        # Technical terms that indicate compliance requirements
        self.technical_indicators = [
            "shall", "must", "required", "should", "recommended", "may", "optional",
            "compliant", "compliance", "conform", "standard", "specification", "requirement",
            "procedure", "process", "method", "test", "verify", "validate", "certification",
            "certified", "approved", "regulation", "regulatory", "guideline", "protocol"
        ]

        # Common standard prefixes and abbreviations
        self.standard_prefixes = [
            "iso", "ieee", "astm", "ansi", "iec", "din", "bs", "en", "jis",
            "gb", "api", "asme", "nfpa", "ul", "mil", "std", "rfc", "itu"
        ]

    def _content_bigrams(self, words: List[str]) -> List[str]:
        """Return lowercased pairs of adjacent words where neither is a stopword."""
        return [
            f"{first} {second}".lower()
            for first, second in zip(words, words[1:])
            if first.lower() not in self.stopwords and second.lower() not in self.stopwords
        ]

    def extract_document_sections(self, document_content: str) -> Dict[str, str]:
        """
        Extract sections from a document to improve matching.

        Sections come from markdown headings (levels 1-3, keyed by the heading
        text as written) and from lines that start with a common section name
        (keyed by the lowercase name). The key "full_document" always holds
        the whole text.

        Args:
            document_content: The document content

        Returns:
            Dictionary of section name to section content
        """
        sections = {"full_document": document_content}

        # Try to identify document sections using markdown headings
        heading_pattern = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)
        headings = list(heading_pattern.finditer(document_content))

        for index, heading in enumerate(headings):
            # Section content runs from this heading to the next, or to the end
            start_pos = heading.end()
            if index + 1 < len(headings):
                end_pos = headings[index + 1].start()
            else:
                end_pos = len(document_content)
            sections[heading.group(2).strip()] = document_content[start_pos:end_pos].strip()

        # Look for common document sections by name
        for section in self._COMMON_SECTIONS:
            pattern = re.compile(rf'(?i)(?:^|\n)(?:{section}|{section.capitalize()})(?:[\s:]+)(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)', re.DOTALL)
            match = pattern.search(document_content)
            if match:
                sections[section] = match.group(1).strip()

        return sections

    def extract_key_terms(self, document_content: str) -> List[str]:
        """
        Extract key technical terms from document content.

        Collected terms are word pairs from sentences that contain a technical
        indicator, all-caps tokens, and standard references such as
        "ISO-9001" or "IEEE 829". All terms are lowercased.

        Args:
            document_content: The document content

        Returns:
            Sorted list of unique key terms
        """
        key_terms = set()

        # Split into sentences
        for sentence in re.split(r'[.!?]\s+', document_content):
            sentence_lower = sentence.lower()
            # Only sentences with a technical indicator contribute word pairs
            if any(indicator in sentence_lower for indicator in self.technical_indicators):
                key_terms.update(self._content_bigrams(sentence.split()))

        # Look for capitalized terms (often defined terms)
        key_terms.update(term.lower() for term in re.findall(r'\b[A-Z][A-Z0-9]+\b', document_content))

        # Look for standard references (e.g., ISO-9001, IEEE 829)
        if self.standard_prefixes:
            prefixes = "|".join(re.escape(prefix) for prefix in self.standard_prefixes)
            reference_pattern = re.compile(rf'\b(?:{prefixes})[-\s]?\d+\b', re.IGNORECASE)
            key_terms.update(ref.lower() for ref in reference_pattern.findall(document_content))

        # Sorted so the result does not depend on string hash ordering
        return sorted(key_terms)

    def extract_standard_keywords(self, standard: Standard) -> List[str]:
        """
        Extract keywords from a standard that can be used for matching.

        Keywords are the standard's name (as is, with hyphens removed and with
        hyphens as spaces), the non-stopword words of its description, and for
        each requirement its id, its non-stopword word pairs and any technical
        indicator words. All keywords are lowercased; empty strings are
        dropped because they would match every document.

        Args:
            standard: The standard to extract keywords from

        Returns:
            Sorted list of unique keywords associated with the standard
        """
        keywords = set()

        # Add standard name and variations
        name_lower = (standard.name or "").lower()
        keywords.update((name_lower, name_lower.replace("-", ""), name_lower.replace("-", " ")))

        # Add standard description words (excluding common words)
        if standard.description:
            keywords.update(
                word.lower() for word in standard.description.split()
                if word.lower() not in self.stopwords
            )

        # Add requirement keywords
        for req in standard.requirements:
            if req.id:
                keywords.add(req.id.lower())

            if req.description:
                words = req.description.split()
                # Word pairs approximate noun phrases (simplified approach)
                keywords.update(self._content_bigrams(words))
                # Add individual technical terms
                keywords.update(
                    word.lower() for word in words
                    if word.lower() in self.technical_indicators
                )

        keywords.discard("")
        return sorted(keywords)

    def calculate_standard_relevance(self, standard: Standard, document_content: str,
                                     sections: Dict[str, str], key_terms: List[str]) -> float:
        """
        Calculate a relevance score for a standard based on multiple factors.

        Components (maximum contribution in parentheses): name mention (0.5),
        share of the standard's keywords found in the document (0.3), name
        mentioned in an important section (0.1) or a section title (0.2),
        key-term overlap (0.2) and requirement wording overlap (0.2).

        Args:
            standard: The standard to evaluate
            document_content: The document content
            sections: Document sections
            key_terms: Key terms extracted from the document

        Returns:
            Relevance score (0.0 to 1.0)
        """
        document_lower = document_content.lower()
        name_lower = (standard.name or "").lower()
        standard_keywords = self.extract_standard_keywords(standard)

        # 1. Standard name matches (highest weight). An empty name is skipped:
        # "" is a substring of every document.
        name_match_score = 0.0
        if name_lower:
            if name_lower in document_lower:
                name_match_score = 0.5
            elif name_lower.replace("-", "") in document_lower:
                name_match_score = 0.4
            elif name_lower.replace("-", " ") in document_lower:
                name_match_score = 0.4

        # 2. Share of the standard's keywords that occur in the document
        keyword_match_score = 0.0
        if standard_keywords:
            matched_keywords = sum(1 for keyword in standard_keywords if keyword in document_lower)
            keyword_match_score = matched_keywords / len(standard_keywords) * 0.3

        # 3. Section-specific matches. Section names are compared
        # case-insensitively because heading-derived names keep the author's
        # capitalization (e.g. "Scope").
        section_match_score = 0.0
        if name_lower:
            if any(
                title.strip().lower() in self._IMPORTANT_SECTIONS and name_lower in content.lower()
                for title, content in sections.items()
            ):
                section_match_score += 0.1

            # Standard name appearing in a section title
            if any(name_lower in title.lower() for title in sections):
                section_match_score += 0.2

        # 4. Key terms that overlap with any keyword of the standard
        matching_terms = sum(
            1 for term in key_terms
            if term and any(kw in term or term in kw for kw in standard_keywords)
        )
        term_match_score = min(0.2, 0.01 * matching_terms)

        # 5. Requirement wording found in the document (words longer than
        # three characters that are not stopwords)
        requirement_hits = 0
        for req in standard.requirements:
            description_lower = (req.description or "").lower()
            requirement_hits += sum(
                1 for word in description_lower.split()
                if word not in self.stopwords and len(word) > 3 and word in document_lower
            )
        requirement_match_score = min(0.2, 0.01 * requirement_hits)

        # Calculate final score (sum of all components), capped at 1.0
        final_score = (
            name_match_score +
            keyword_match_score +
            section_match_score +
            term_match_score +
            requirement_match_score
        )
        return min(final_score, 1.0)

    def find_relevant_standards(self, document_content: str, standards: List[Standard],
                                threshold: float = 0.1, max_standards: int = 5) -> List[Tuple[str, float]]:
        """
        Find standards relevant to a document with relevance scores.

        Args:
            document_content: The document content
            standards: List of available standards
            threshold: Minimum relevance score threshold
            max_standards: Maximum number of standards to return

        Returns:
            List of tuples (standard_name, relevance_score) sorted by relevance
        """
        if not standards:
            return []

        # Sections and key terms depend only on the document: compute once
        sections = self.extract_document_sections(document_content)
        key_terms = self.extract_key_terms(document_content)

        standard_scores = []
        for standard in standards:
            score = self.calculate_standard_relevance(
                standard, document_content, sections, key_terms
            )
            if score >= threshold:
                standard_scores.append((standard.name, score))
                logger.debug(f"Standard {standard.name} relevance score: {score:.2f}")

        # Sort by relevance score (highest first); ties keep input order
        standard_scores.sort(key=lambda item: item[1], reverse=True)

        # Limit to max_standards
        return standard_scores[:max_standards]
|
||||
Reference in New Issue
Block a user