Initial commit

This commit is contained in:
Aherobo Ovie Victor
2025-07-17 22:20:25 +01:00
commit 0e3e22e8cb
39 changed files with 13295 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
"""Services for the Mini SpecsComply Pro application."""
+461
View File
@@ -0,0 +1,461 @@
# Document processing
import os
import uuid
from datetime import datetime, timedelta
from typing import Dict, List, Optional, BinaryIO, Tuple
import re
from loguru import logger
from app.core.models import (
Document,
DocumentMetadata,
DocumentStatus,
ComplianceReport,
ComplianceIssue,
ComplianceLevel,
DocumentEmbedding
)
from app.services.embedding import EmbeddingService
from app.services.reasoning import ReasoningService
from app.services.standards import StandardsService
from app.utils.token_counter import count_tokens, truncate_by_tokens
class DocumentService:
    """Service for handling document processing and storage.

    Documents and compliance reports are held in in-memory dictionaries keyed
    by their IDs, so nothing survives a process restart.
    """

    # Extensions accepted by _validate_file_type (compared lower-cased).
    # NOTE(review): .doc/.docx/.pdf are accepted here but _read_file_content
    # only decodes raw bytes as text; there is no format-specific extraction.
    SUPPORTED_EXTENSIONS = frozenset({'.txt', '.md', '.rst', '.doc', '.docx', '.pdf'})

    def __init__(self, embedding_service: EmbeddingService, reasoning_service: ReasoningService, standards_service: Optional[StandardsService] = None):
        """
        Initialize with required services.
        Args:
            embedding_service: Service used to embed document sections
            reasoning_service: Service used to produce compliance reports
            standards_service: Optional standards service; a StandardsService
                is created when none (or a falsy value) is supplied
        """
        self.embedding_service = embedding_service
        self.reasoning_service = reasoning_service
        self.standards_service = standards_service or StandardsService()
        self.documents = {}  # In-memory storage for documents (replace with DB in production)
        self.reports = {}  # In-memory storage for reports (replace with DB in production)

    async def upload_document(self, file: BinaryIO, filename: str) -> Document:
        """
        Process an uploaded document.
        Args:
            file: The document file
            filename: Name of the uploaded file
        Returns:
            Document object with metadata
        Raises:
            ValueError: If the file extension is not supported
            Exception: Any error raised while processing is logged and re-raised
                after the document has been marked as FAILED
        """
        if not self._validate_file_type(filename):
            raise ValueError("Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")

        content = await self._read_file_content(file)

        # One timestamp so upload_timestamp and last_modified start out equal.
        now = datetime.now()
        metadata = DocumentMetadata(
            filename=filename,
            file_type=self._get_file_type(filename),
            file_size=len(content),  # length of the decoded text, not the byte count
            upload_timestamp=now,
            last_modified=now
        )

        document_id = str(uuid.uuid4())
        document = Document(
            id=document_id,
            metadata=metadata,
            status=DocumentStatus.PENDING,
            version=1
        )
        self.documents[document_id] = document

        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED
            raise
        return document

    async def get_document(self, document_id: str) -> Optional[Document]:
        """
        Retrieve a document by ID.
        Args:
            document_id: The ID of the document to retrieve
        Returns:
            Document object if found, None otherwise
        """
        return self.documents.get(document_id)

    async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
        """
        Retrieve a compliance report by ID.
        Args:
            report_id: The ID of the report to retrieve
        Returns:
            ComplianceReport object if found, None otherwise
        """
        return self.reports.get(report_id)

    async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
        """
        Resubmit a document with changes.
        Args:
            document_id: The ID of the document to resubmit
            file: The updated document file
        Returns:
            Updated Document object. A processing failure is logged and
            reported through ``status == FAILED`` rather than raised.
        Raises:
            ValueError: If no document with the given ID exists
        """
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document with ID {document_id} not found")

        content = await self._read_file_content(file)

        document.metadata.file_size = len(content)
        document.metadata.last_modified = datetime.now()
        document.version += 1
        document.status = DocumentStatus.PENDING

        try:
            await self._process_document(document_id, content)
        except Exception as e:
            # Unlike upload_document, a failed resubmission is not re-raised.
            logger.error(f"Error processing resubmitted document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED
        return document

    async def process_document(self, document_id: str, content: str) -> ComplianceReport:
        """
        Process document and generate compliance report.
        Args:
            document_id: The ID of the document
            content: Document content
        Returns:
            ComplianceReport object (also stored in ``self.reports``)
        Raises:
            ValueError: If the document is unknown
            Exception: Errors from the embedding, standards or reasoning
                services are logged and re-raised
        """
        try:
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            sections = self._split_into_sections(content)
            document.embedding = await self.embedding_service.embed_document(document_id, sections)

            if self.standards_service:
                # Log the standards service instance ID to verify singleton pattern
                logger.info(f"Using StandardsService instance: {id(self.standards_service)}")
                logger.info(f"Standards count before matching: {len(self.standards_service.standards)}")
                standard_names = await self.standards_service.get_standard_names_for_document(content)
                logger.info(f"Identified standards for document {document_id}: {standard_names}")
            else:
                logger.warning(f"No StandardsService available for document {document_id}")
                standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]

            report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)
            self.reports[report.report_id] = report
            return report
        except Exception as e:
            logger.error(f"Error in document processing: {str(e)}")
            raise

    async def _read_file_content(self, file: BinaryIO) -> str:
        """
        Read and decode file content.
        Args:
            file: The file to read
        Returns:
            File content as string: UTF-8 when the bytes are valid UTF-8,
            otherwise decoded as latin-1. Text-mode streams are returned as-is.
        """
        raw = file.read()
        if isinstance(raw, str):
            # Stream was opened in text mode; nothing to decode.
            return raw
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte value to a character, so this fallback
            # cannot raise; no further error handling is needed.
            return raw.decode('latin-1')

    def _get_file_type(self, filename: str) -> str:
        """
        Determine file type from filename.
        Args:
            filename: The name of the file
        Returns:
            File type (lower-cased extension without the leading dot)
        """
        _, extension = os.path.splitext(filename)
        return extension.lstrip('.').lower()

    def _validate_file_type(self, filename: str) -> bool:
        """
        Validate if the file type is supported.
        Args:
            filename: Name of the file to validate
        Returns:
            bool: True if file type is supported, False otherwise
        """
        _, ext = os.path.splitext(filename)
        return ext.lower() in self.SUPPORTED_EXTENSIONS

    def _split_into_sections(self, content: str) -> Dict[str, str]:
        """
        Split document content into sections.

        The result always contains ``"full_document"``. When markdown headings
        (levels 1-3) exist, each becomes ``"h{level}_{title}"``; repeated
        headings get a ``_2``, ``_3``, ... suffix so no section is overwritten.
        Without headings, blank-line separated paragraphs longer than 100
        characters are stored as ``"paragraph_{n}"``.
        Args:
            content: The document content
        Returns:
            Dictionary mapping section names to section content
        """
        sections = {"full_document": content}

        heading_pattern = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)
        headings = list(heading_pattern.finditer(content))

        if not headings:
            paragraphs = [p for p in content.split('\n\n') if p.strip()]
            for number, paragraph in enumerate(paragraphs, start=1):
                if len(paragraph) > 100:  # Only include substantial paragraphs
                    sections[f"paragraph_{number}"] = paragraph
            return sections

        # Each section runs from the end of its heading to the next heading.
        section_ends = [h.start() for h in headings[1:]] + [len(content)]
        for heading, end_pos in zip(headings, section_ends):
            base_key = f"h{len(heading.group(1))}_{heading.group(2).strip()}"
            section_key = base_key
            duplicate_number = 2
            while section_key in sections:
                section_key = f"{base_key}_{duplicate_number}"
                duplicate_number += 1
            sections[section_key] = content[heading.end():end_pos].strip()
        return sections

    async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
        """
        Generate a mock compliance report for development/testing.

        Runs three keyword heuristics over ``sections["full_document"]`` and
        deducts 0.1 from the score per issue found.
        Args:
            document_id: The ID of the document
            sections: Dictionary of document sections
        Returns:
            ComplianceReport object
        """
        issues = []
        if "full_document" in sections:
            content = sections["full_document"]
            lowered = content.lower()
            if "introduction" not in lowered:
                issues.append(ComplianceIssue(
                    section="Document Structure",
                    description="Missing introduction section",
                    level=ComplianceLevel.MAJOR,
                    recommendation="Add an introduction section to provide context for the document"
                ))
            if content.count('#') < 3:
                issues.append(ComplianceIssue(
                    section="Formatting",
                    description="Insufficient section headings",
                    level=ComplianceLevel.MINOR,
                    recommendation="Use markdown headings to better structure the document"
                ))
            if "compliance" in lowered and "standard" not in lowered:
                issues.append(ComplianceIssue(
                    section="Technical Content",
                    description="Mentions compliance but doesn't reference specific standards",
                    level=ComplianceLevel.CRITICAL,
                    recommendation="Specify which standards or regulations the document complies with"
                ))

        # With no issues this evaluates to 1.0.
        compliance_score = max(0.0, 1.0 - (len(issues) * 0.1))

        if not issues:
            summary = "The document meets all compliance requirements. No issues found."
        else:
            labelled_levels = (
                (ComplianceLevel.CRITICAL, "critical"),
                (ComplianceLevel.MAJOR, "major"),
                (ComplianceLevel.MINOR, "minor"),
            )
            breakdown = []
            for level, label in labelled_levels:
                count = sum(1 for issue in issues if issue.level == level)
                if count:
                    breakdown.append(f"{count} {label}")
            summary = f"The document has {len(issues)} compliance issues: " + ", ".join(breakdown)
            summary = summary.rstrip(", ") + "."
            summary += " See detailed report for recommendations."

        return ComplianceReport(
            document_id=document_id,
            compliance_score=compliance_score,
            summary=summary,
            issues=issues
        )

    async def _process_document(self, document_id: str, content: str) -> None:
        """
        Internal method to process a document and update its status.

        Status moves PENDING -> PROCESSING -> COMPLETED, or to FAILED when
        report generation raises.
        Args:
            document_id: The ID of the document to process
            content: The document content
        Raises:
            ValueError: If the document is unknown
        """
        # Looked up before the try block so the handler below never touches an
        # unbound name.
        document = self.documents.get(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        document.status = DocumentStatus.PROCESSING
        try:
            report = await self.process_document(document_id, content)
            document.reports.append(report.report_id)
        except Exception:
            document.status = DocumentStatus.FAILED
            raise
        document.status = DocumentStatus.COMPLETED

    async def get_document_stats(self, document_id: str) -> Dict[str, object]:
        """
        Get statistics for a document.
        Args:
            document_id: The ID of the document
        Returns:
            Dictionary containing document statistics; the issue counts and
            score come from the most recently appended report, if any
        Raises:
            ValueError: If the document is unknown
        """
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        latest_report = None
        if document.reports:
            latest_report = await self.get_report(document.reports[-1])

        return {
            "document_id": document_id,
            "version": document.version,
            "status": document.status,
            "file_size": document.metadata.file_size,
            "upload_date": document.metadata.upload_timestamp,
            "last_modified": document.metadata.last_modified,
            "num_reports": len(document.reports),
            "latest_compliance_score": latest_report.compliance_score if latest_report else None,
            "critical_issues": latest_report.critical_issues_count if latest_report else 0,
            "major_issues": latest_report.major_issues_count if latest_report else 0,
            "minor_issues": latest_report.minor_issues_count if latest_report else 0
        }

    async def cleanup_old_documents(self, days: int = 30) -> List[str]:
        """
        Remove documents older than specified days.
        Args:
            days: Number of days after which documents should be removed
        Returns:
            List of removed document IDs
        """
        cutoff_date = datetime.now() - timedelta(days=days)
        removed_ids = []
        # Iterate over a snapshot because entries are removed inside the loop.
        for doc_id, document in list(self.documents.items()):
            if document.metadata.upload_timestamp < cutoff_date:
                for report_id in document.reports:
                    self.reports.pop(report_id, None)
                self.documents.pop(doc_id)
                removed_ids.append(doc_id)
        return removed_ids
+254
View File
@@ -0,0 +1,254 @@
import cohere
from typing import List, Dict, Any, Optional
import uuid
from pinecone import Pinecone
import weaviate
from loguru import logger
from app.core.config import settings
from app.core.models import DocumentEmbedding
class EmbeddingService:
    """Service for document embedding and vector database operations."""

    def __init__(self):
        """Initialize the embedding service with the Cohere client and vector DB."""
        self.cohere_client = cohere.Client(settings.COHERE_API_KEY)
        # Backend is chosen from settings; falls back to MockVectorDB.
        self.vector_db_client = self._init_vector_db()
        self.embedding_model = settings.EMBEDDING_MODEL

    def _init_vector_db(self) -> Any:
        """
        Initialize the vector database client based on settings.
        Returns:
            A Pinecone index, a Weaviate client, or a MockVectorDB when no
            complete configuration is present
        """
        if settings.VECTOR_DB == "pinecone" and settings.PINECONE_API_KEY:
            pc = Pinecone(api_key=settings.PINECONE_API_KEY)
            existing_indexes = [idx["name"] for idx in pc.list_indexes()]
            if settings.PINECONE_INDEX_NAME not in existing_indexes:
                # NOTE(review): recent Pinecone clients also expect a `spec=`
                # argument (serverless/pod) here — confirm against the
                # installed client version and add the deployment settings.
                pc.create_index(
                    name=settings.PINECONE_INDEX_NAME,
                    dimension=1024,  # Cohere embed-english-v3.0 dimension
                    metric="cosine"
                )
            return pc.Index(settings.PINECONE_INDEX_NAME)

        if settings.VECTOR_DB == "weaviate" and settings.WEAVIATE_URL:
            auth_config = weaviate.auth.AuthApiKey(api_key=settings.WEAVIATE_API_KEY) if settings.WEAVIATE_API_KEY else None
            client = weaviate.Client(
                url=settings.WEAVIATE_URL,
                auth_client_secret=auth_config
            )
            # schema.get() returns the schema dict; look for our class by name
            # instead of calling dict methods on the bool from contains().
            existing_classes = {cls.get("class") for cls in client.schema.get().get("classes", [])}
            if "Document" not in existing_classes:
                class_obj = {
                    "class": "Document",
                    "vectorizer": "none",  # We'll provide our own vectors
                    "properties": [
                        {
                            "name": "content",
                            "dataType": ["text"]
                        },
                        {
                            "name": "document_id",
                            "dataType": ["string"]
                        },
                        {
                            "name": "section_name",
                            "dataType": ["string"]
                        }
                    ]
                }
                client.schema.create_class(class_obj)
            return client

        logger.warning("No valid vector database configuration found. Using mock implementation.")
        return MockVectorDB()

    def _uses_weaviate(self) -> bool:
        """Return True only when a real Weaviate client (not the mock) is active."""
        return settings.VECTOR_DB == "weaviate" and not isinstance(self.vector_db_client, MockVectorDB)

    async def embed_document(self, document_id: str, sections: Dict[str, str]) -> DocumentEmbedding:
        """
        Embed document sections and store in vector database.
        Args:
            document_id: Unique identifier for the document
            sections: Dictionary mapping section names to section content
        Returns:
            DocumentEmbedding object with embedding metadata
        Raises:
            Exception: Any embedding or storage error is logged and re-raised
        """
        section_ids = {}
        for section_name, content in sections.items():
            try:
                embedding_response = self.cohere_client.embed(
                    texts=[content],
                    model=self.embedding_model,
                    input_type="search_document"
                )
                embedding_vector = embedding_response.embeddings[0]

                if self._uses_weaviate():
                    # Weaviate rejects object IDs that are not valid UUIDs, so
                    # a real UUID is used as the section ID for this backend.
                    section_id = str(uuid.uuid4())
                    self.vector_db_client.data_object.create(
                        class_name="Document",
                        data_object={
                            "content": content,
                            "document_id": document_id,
                            "section_name": section_name
                        },
                        uuid=section_id,
                        vector=embedding_vector
                    )
                else:
                    section_id = f"{document_id}_{section_name}_{str(uuid.uuid4())[:8]}"
                    if settings.VECTOR_DB == "pinecone":
                        self.vector_db_client.upsert(
                            vectors=[{
                                "id": section_id,
                                "values": embedding_vector,
                                "metadata": {
                                    "document_id": document_id,
                                    "section_name": section_name,
                                    "content": content[:1000]  # Store truncated content for context
                                }
                            }],
                            namespace=document_id
                        )

                section_ids[section_name] = section_id
                logger.info(f"Successfully embedded section '{section_name}' for document {document_id}")
            except Exception as e:
                logger.error(f"Error embedding section '{section_name}': {str(e)}")
                raise

        return DocumentEmbedding(
            embedding_id=str(uuid.uuid4()),
            embedding_model=self.embedding_model,
            vector_db=settings.VECTOR_DB,
            sections=section_ids
        )

    async def retrieve_similar_sections(self, query: str, document_id: Optional[str] = None, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Retrieve similar document sections for a query.
        Args:
            query: The query text to find similar sections for
            document_id: Optional document ID to restrict search
            top_k: Number of results to return
        Returns:
            List of similar sections with metadata. For Weaviate the "score"
            entry holds the reported distance.
        """
        query_embedding = self.cohere_client.embed(
            texts=[query],
            model=self.embedding_model,
            input_type="search_query"
        ).embeddings[0]

        similar_sections = []
        if settings.VECTOR_DB == "pinecone":
            namespace = document_id if document_id else None
            results = self.vector_db_client.query(
                vector=query_embedding,
                top_k=top_k,
                namespace=namespace,
                include_metadata=True
            )
            # Tolerate a result object without `.matches`.
            for match in getattr(results, "matches", None) or []:
                similar_sections.append({
                    "section_id": match.id,
                    "document_id": match.metadata["document_id"],
                    "section_name": match.metadata["section_name"],
                    "content": match.metadata.get("content", ""),
                    "score": match.score
                })
        elif self._uses_weaviate():
            query_builder = self.vector_db_client.query.get(
                "Document", ["content", "document_id", "section_name"]
            ).with_near_vector({
                "vector": query_embedding
            }).with_limit(top_k).with_additional(["id", "distance"])
            # `_additional` is only returned when explicitly requested above.
            if document_id:
                query_builder = query_builder.with_where({
                    "path": ["document_id"],
                    "operator": "Equal",
                    "valueString": document_id
                })
            results = query_builder.do()
            for item in results.get("data", {}).get("Get", {}).get("Document", []):
                additional = item.get("_additional", {})
                similar_sections.append({
                    "section_id": additional.get("id"),
                    "document_id": item.get("document_id"),
                    "section_name": item.get("section_name"),
                    "content": item.get("content", ""),
                    "score": additional.get("distance")
                })
        # Any other configuration: mock implementation, no results.
        return similar_sections
class MockVectorDB:
    """Mock vector database for development without actual vector DB.

    Only metadata is stored (per namespace); vectors themselves are ignored
    and every query hit gets a fixed score.
    """

    def __init__(self):
        # namespace -> {vector_id: metadata}
        self.vectors = {}
        logger.warning("Using mock vector database. Not suitable for production.")

    def upsert(self, vectors, namespace=None):
        """
        Mock upsert method.
        Args:
            vectors: Iterable of dicts with at least 'id' and 'metadata' keys
            namespace: Namespace to store into ("default" when omitted)
        """
        bucket = self.vectors.setdefault(namespace or "default", {})
        for vector in vectors:
            vector_id = vector['id']
            bucket[vector_id] = vector['metadata']

    def query(self, vector, top_k=5, namespace=None, include_metadata=True):
        """
        Mock query method.
        Args:
            vector: Query vector (ignored)
            top_k: Maximum number of matches to return
            namespace: Namespace to read from ("default" when omitted)
            include_metadata: Accepted for API compatibility (ignored)
        Returns:
            An object with a ``matches`` list; empty for an unknown namespace,
            so callers can always read ``.matches``
        """
        from collections import namedtuple
        Match = namedtuple('Match', ['id', 'score', 'metadata'])
        Results = namedtuple('Results', ['matches'])

        stored = self.vectors.get(namespace or "default", {})
        # Just return the first top_k stored entries with a fixed score.
        matches = [
            Match(id=vector_id, score=0.8, metadata=metadata)
            for vector_id, metadata in list(stored.items())[:top_k]
        ]
        return Results(matches=matches)
+136
View File
@@ -0,0 +1,136 @@
# Reranking services
import cohere
from typing import List, Dict, Any
from loguru import logger
from tenacity import retry, stop_after_attempt, wait_exponential
from app.core.config import settings
from app.core.models import ComplianceIssue, ComplianceReport, ComplianceLevel
class RankingService:
    """Service for ranking and prioritizing compliance issues using Cohere Reranker."""

    def __init__(self):
        """Initialize the ranking service with the Cohere client."""
        self.cohere_client = cohere.Client(settings.COHERE_API_KEY)
        self.reranker_model = settings.RERANKER_MODEL

    async def prioritize_issues(self, report: ComplianceReport, max_issues: int = 10) -> ComplianceReport:
        """
        Prioritize and rank compliance issues in a report.

        Issues are ordered by severity first and reranker relevance second.
        Args:
            report: The compliance report with issues to prioritize
            max_issues: Maximum number of issues to include in the final report
        Returns:
            Updated compliance report with prioritized issues. The original
            report is returned unchanged when it has at most one issue or when
            prioritization fails.
        """
        if not report.issues or len(report.issues) <= 1:
            # No need to rank if there's only 0 or 1 issues
            return report
        try:
            issue_texts = [
                f"Section: {issue.section}. "
                f"Level: {issue.level.value}. "
                f"Description: {issue.description}. "
                f"Recommendation: {issue.recommendation}"
                for issue in report.issues
            ]
            # Query object representing what we're looking for
            query = "critical compliance issues that require immediate attention"
            reranked_issues = await self._rerank_issues(query, issue_texts)

            # Index -> relevance lookup (first entry wins for a repeated index).
            relevance_by_index = {}
            for item in reranked_issues:
                relevance_by_index.setdefault(item["index"], item["relevance_score"])

            level_scores = {
                ComplianceLevel.CRITICAL: 4,
                ComplianceLevel.MAJOR: 3,
                ComplianceLevel.MINOR: 2,
                ComplianceLevel.INFO: 1
            }

            # level_score * 100 + rerank_score keeps severity the primary
            # sorting factor as long as relevance stays within [0, 1].
            scored_issues = [
                (
                    (level_scores.get(issue.level, 0) * 100) + relevance_by_index.get(position, 0.0),
                    issue,
                )
                for position, issue in enumerate(report.issues)
            ]
            scored_issues.sort(key=lambda pair: pair[0], reverse=True)
            sorted_issues = [issue for _, issue in scored_issues[:max_issues]]

            # Carry over applied_standards when the report has it, so the
            # prioritized copy does not lose that information.
            extra_fields = {}
            applied_standards = getattr(report, "applied_standards", None)
            if applied_standards is not None:
                extra_fields["applied_standards"] = applied_standards

            return ComplianceReport(
                report_id=report.report_id,
                document_id=report.document_id,
                timestamp=report.timestamp,
                compliance_score=report.compliance_score,
                summary=report.summary,
                issues=sorted_issues,
                **extra_fields
            )
        except Exception as e:
            logger.error(f"Error prioritizing issues: {str(e)}")
            # If ranking fails, return the original report
            return report

    async def _rerank_issues(self, query: str, issue_texts: List[str]) -> List[Dict[str, Any]]:
        """
        Rerank issues using Cohere Reranker.
        Args:
            query: The search query to compare issues against
            issue_texts: List of issue descriptions to rank
        Returns:
            List of dictionaries with "index" (position in issue_texts) and
            "relevance_score". When the reranker keeps failing after retries,
            a positional fallback with scores in (0, 1] is returned instead.
        """
        try:
            return await self._call_reranker(query, issue_texts)
        except Exception as e:
            logger.error(f"Error calling Cohere Reranker: {str(e)}")
            # Basic ranking by original position; scores stay positive so they
            # can never outweigh a severity level.
            total = len(issue_texts)
            return [{"index": i, "relevance_score": 1.0 - (i / total)}
                    for i in range(total)]

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), reraise=True)
    async def _call_reranker(self, query: str, issue_texts: List[str]) -> List[Dict[str, Any]]:
        """Call the Cohere Rerank endpoint; exceptions propagate so the retry policy applies."""
        # NOTE(review): this is a synchronous client call inside a coroutine.
        response = self.cohere_client.rerank(
            model=self.reranker_model,
            query=query,
            documents=issue_texts,
            top_n=len(issue_texts)
        )
        return [
            {
                "index": result.index,  # Original index in the issues list
                "relevance_score": result.relevance_score
            }
            for result in response.results
        ]
+168
View File
@@ -0,0 +1,168 @@
# Reasoning with LLMs using Groq
import json
from typing import Dict, List
from loguru import logger
from tenacity import retry, stop_after_attempt, wait_exponential
from app.core.config import settings
from app.core.models import ComplianceIssue, ComplianceLevel, ComplianceReport
from app.utils.token_counter import count_tokens, truncate_by_tokens
from groq import Groq # Assuming groq Python SDK is installed
class ReasoningService:
    """Service for performing deep reasoning on documents using Groq."""

    def __init__(self):
        """Initialize the reasoning service with the Groq client."""
        self.client = Groq(api_key=settings.GROQ_API_KEY)
        self.model = settings.REASONING_MODEL  # e.g., "mixtral-8x7b-32768"

    # NOTE(review): _query_groq is retried as well, so a persistent failure
    # can trigger up to 3 x 3 API calls — confirm this is intended.
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    async def analyze_document(self, document_id: str, sections: Dict[str, str], standards: List[str]) -> ComplianceReport:
        """
        Analyze a document against the given standards.
        Args:
            document_id: The ID of the document
            sections: Dictionary mapping section names to section content
            standards: Names of the standards to check against
        Returns:
            ComplianceReport built from the model's response
        Raises:
            Exception: Query or parsing errors are logged and re-raised
        """
        document_content = "\n\n".join(f"# {name}\n{content}" for name, content in sections.items())

        # Use token-based truncation instead of character-based
        max_tokens = 30000  # Adjust based on model context window
        token_count = count_tokens(document_content)
        logger.info(f"Document {document_id} has {token_count} tokens before truncation")
        if token_count > max_tokens:
            document_content = truncate_by_tokens(document_content, max_tokens)
            logger.info(f"Document {document_id} truncated to {max_tokens} tokens")

        prompt = self._create_analysis_prompt(document_content, standards)
        try:
            response = await self._query_groq(prompt)
            return self._parse_compliance_response(document_id, response, standards)
        except Exception as e:
            logger.error(f"Error analyzing document with Groq: {str(e)}")
            raise

    def _create_analysis_prompt(self, document_content: str, standards: List[str]) -> str:
        """
        Build the analysis prompt.
        Args:
            document_content: Document text to embed in the prompt
            standards: Standard names, rendered as a bullet list
        Returns:
            Prompt asking for a JSON answer with summary, score and issues
        """
        standards_text = "\n".join(f"- {standard}" for standard in standards)
        return f"""<document>
{document_content}
</document>
<standards>
{standards_text}
</standards>
You are an expert in document compliance and technical specifications. Please analyze the document above against the listed standards.
Your job is to identify compliance issues and provide detailed reasoning and recommendations. Focus on:
1. Technical accuracy and completeness
2. Compliance with the specified standards
3. Document structure and organization
4. Clarity and specificity of language
5. Consistency and coherence
For each compliance issue you find, please provide:
- The section where the issue appears
- A detailed description of the issue
- The severity level (critical, major, minor, or info)
- A thorough explanation of why this is an issue and how it impacts compliance
- Specific, actionable recommendations to fix the issue
- References to specific standards or best practices that apply
Respond in the following JSON format:
{{
"summary": "Comprehensive overall assessment of the document",
"compliance_score": 0.0 to 1.0,
"issues": [
{{
"section": "Section name",
"description": "Detailed issue description",
"level": "critical/major/minor/info",
"reasoning": "Thorough explanation of why this is an issue",
"standard_references": ["Specific standards or requirements that are violated"],
"recommendation": "Detailed, actionable recommendation to fix the issue"
}}
]
}}"""

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
    async def _query_groq(self, prompt: str) -> str:
        """
        Send the prompt to Groq and return the first choice's message content.
        Args:
            prompt: The user prompt
        Returns:
            Text content of the model's reply
        Raises:
            Exception: Client errors are logged and re-raised (and retried)
        """
        try:
            # NOTE(review): synchronous client call inside a coroutine.
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an AI assistant specialized in document compliance analysis."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=4000,
                temperature=0.2,
                top_p=1.0
            )
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Error querying Groq: {str(e)}")
            raise

    def _parse_compliance_response(self, document_id: str, response: str, standards: List[str]) -> ComplianceReport:
        """
        Convert the model's reply into a ComplianceReport.

        The outermost ``{...}`` span of the reply is parsed as JSON. Malformed
        fields are normalised where that is unambiguous: the score is clamped
        to [0, 1], unknown or missing levels become MINOR, a single reference
        string becomes a one-element list, and non-object issue entries are
        skipped.
        Args:
            document_id: The ID of the document
            response: Raw text returned by the model
            standards: Standards recorded on the report as applied
        Returns:
            ComplianceReport; a score-0 report with one CRITICAL "System"
            issue when the JSON cannot be decoded
        Raises:
            ValueError: If no JSON object is found in the response
            Exception: Other parsing errors are logged and re-raised
        """
        try:
            json_start = response.find('{')
            json_end = response.rfind('}') + 1
            if json_start == -1 or json_end == 0:
                raise ValueError("Could not find JSON in response")
            data = json.loads(response[json_start:json_end])

            summary = data.get("summary", "No summary provided")
            # Clamp: the prompt asks for 0.0-1.0 but the model may not comply.
            compliance_score = min(1.0, max(0.0, float(data.get("compliance_score", 0.5))))

            level_by_name = {
                "critical": ComplianceLevel.CRITICAL,
                "major": ComplianceLevel.MAJOR,
                "info": ComplianceLevel.INFO,
            }
            issues = []
            for issue_data in data.get("issues") or []:
                if not isinstance(issue_data, dict):
                    logger.warning(f"Skipping malformed issue entry: {issue_data!r}")
                    continue
                level_name = str(issue_data.get("level") or "minor").strip().lower()
                references = issue_data.get("standard_references") or []
                if isinstance(references, str):
                    references = [references]
                issues.append(ComplianceIssue(
                    section=issue_data.get("section", "Unknown"),
                    description=issue_data.get("description", "No description provided"),
                    level=level_by_name.get(level_name, ComplianceLevel.MINOR),
                    reasoning=issue_data.get("reasoning", "No detailed reasoning provided"),
                    standard_references=references,
                    recommendation=issue_data.get("recommendation", "No recommendation provided")
                ))

            return ComplianceReport(
                document_id=document_id,
                compliance_score=compliance_score,
                summary=summary,
                issues=issues,
                applied_standards=standards
            )
        except json.JSONDecodeError:
            logger.error("Failed to parse JSON from response")
            return ComplianceReport(
                document_id=document_id,
                compliance_score=0.0,
                summary="Failed to analyze document due to parsing error.",
                issues=[
                    ComplianceIssue(
                        section="System",
                        description="Failed to parse compliance analysis results.",
                        level=ComplianceLevel.CRITICAL,
                        reasoning="The system encountered an error while parsing the compliance analysis results.",
                        standard_references=[],
                        recommendation="Please try resubmitting the document or contact support."
                    )
                ],
                applied_standards=[]
            )
        except Exception as e:
            logger.error(f"Error parsing compliance response: {str(e)}")
            raise
+250
View File
@@ -0,0 +1,250 @@
# Standards management
import json
import os
from typing import Dict, List, Optional, BinaryIO, Tuple
import uuid
from loguru import logger
from app.core.models import Standard, Requirement, RequirementSeverity
from app.utils.helpers import load_standards_from_file
from app.services.standards_matcher import StandardsMatcher
# Singleton instance to ensure all parts of the application use the same standards.
# Starts as None; StandardsService.__new__ assigns it on first construction and
# returns the same object on every later call.
_standards_service_instance = None
class StandardsService:
"""Service for managing compliance standards."""
def __new__(cls):
    """
    Implement singleton pattern to ensure all parts of the app use the same standards.
    Returns:
        The shared StandardsService instance, created and populated with the
        default standards on first use
    """
    global _standards_service_instance
    if _standards_service_instance is None:
        instance = super(StandardsService, cls).__new__(cls)
        instance.standards = {}  # In-memory storage for standards
        instance.matcher = StandardsMatcher()  # Advanced standards matching logic
        instance._load_default_standards()
        # Publish only after setup succeeded, so a failure above cannot leave
        # a half-initialised singleton behind for later callers.
        _standards_service_instance = instance
    return _standards_service_instance
def __init__(self):
    """Initialize the standards service.

    Intentionally does nothing: it runs on every ``StandardsService()`` call,
    and state set here would be reset each time the shared instance is handed
    out.
    """
    # Initialization is done in __new__ for the singleton pattern
def _load_default_standards(self):
    """
    Load default standards from the standards directory.

    Reads every ``*.json`` file in the ``standard`` directory three levels
    above this module and registers each entry of its "standards" list in
    ``self.standards``. A file that fails to load is logged and skipped.
    """
    standards_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "standard")
    if not os.path.exists(standards_dir):
        logger.warning(f"Standards directory not found: {standards_dir}")
        return
    # Sorted so that, when two files define the same standard ID, the one
    # that wins does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(standards_dir)):
        if not filename.endswith(".json"):
            continue
        try:
            file_path = os.path.join(standards_dir, filename)
            standards_data = load_standards_from_file(file_path)
            if "standards" in standards_data:
                for std_data in standards_data["standards"]:
                    standard = self._create_standard_from_data(std_data)
                    self.standards[standard.id] = standard
                    logger.info(f"Loaded standard: {standard.name} ({standard.id})")
        except Exception as e:
            logger.error(f"Error loading standard from {filename}: {str(e)}")
def _create_standard_from_data(self, data: Dict) -> Standard:
    """
    Create a Standard object from dictionary data.

    Missing or null IDs are replaced with a fresh UUID; a missing, null or
    unrecognised severity is treated as MINOR.
    Args:
        data: Dictionary containing standard data
    Returns:
        Standard object
    """
    severity_by_name = {
        "critical": RequirementSeverity.CRITICAL,
        "major": RequirementSeverity.MAJOR,
        "info": RequirementSeverity.INFO,
    }
    requirements = []
    # `or []` tolerates an explicit null "requirements" entry.
    for req_data in data.get("requirements") or []:
        # str(... or "minor") guards against null / non-string severities.
        severity_name = str(req_data.get("severity") or "minor").strip().lower()
        requirements.append(Requirement(
            id=req_data.get("id") or str(uuid.uuid4()),
            description=req_data.get("description", ""),
            severity=severity_by_name.get(severity_name, RequirementSeverity.MINOR),
            details=req_data.get("details", None)
        ))
    return Standard(
        id=data.get("id") or str(uuid.uuid4()),
        name=data.get("name", "Unnamed Standard"),
        description=data.get("description", ""),
        requirements=requirements
    )
async def get_all_standards(self) -> List[Standard]:
    """
    Return every standard currently registered.

    Returns:
        A new list holding the values of ``self.standards``
    """
    registered = [*self.standards.values()]
    return registered
async def get_standard(self, standard_id: str) -> Optional[Standard]:
    """
    Look up a standard by its ID.

    Args:
        standard_id: ID of the standard to retrieve

    Returns:
        The matching Standard, or None when the ID is not registered
    """
    registry = self.standards
    found = registry.get(standard_id, None)
    return found
async def get_standard_by_name(self, name: str) -> Optional[Standard]:
    """
    Look up a standard by name, ignoring case.

    Args:
        name: Name of the standard to retrieve

    Returns:
        The first registered Standard whose lowercased name equals the
        lowercased ``name``, or None when there is no such standard
    """
    wanted = name.lower()
    candidates = (
        candidate
        for candidate in self.standards.values()
        if candidate.name.lower() == wanted
    )
    return next(candidates, None)
async def upload_standard(self, file: BinaryIO, filename: str) -> Standard:
    """
    Upload and process a standard definition file.

    The JSON document is either a single standard object or an object with a
    "standards" list. Every entry is converted before any of them is stored,
    so a file with a broken entry does not leave earlier entries registered.

    Args:
        file: The standard definition file (JSON)
        filename: Name of the uploaded file (currently not used)

    Returns:
        The uploaded Standard; for a multi-standard file, the first one

    Raises:
        ValueError: If the content is not valid JSON, the top-level value is
            not a JSON object, or the "standards" list is empty
    """
    try:
        content = await self._read_file_content(file)
        data = json.loads(content)

        # A bare list/string/number cannot describe a standard; reject it
        # explicitly instead of failing later on a missing `.get`.
        if not isinstance(data, dict):
            raise ValueError("Standard definition file must contain a JSON object")

        if "standards" in data and isinstance(data["standards"], list):
            # Multiple standards in file
            entries = data["standards"]
            if not entries:
                # There is no Standard to return, and the declared return
                # type does not allow None.
                raise ValueError("Standard definition file contains no standards")
        else:
            # Single standard in file
            entries = [data]

        # Convert everything first so registration is all-or-nothing.
        standards = [self._create_standard_from_data(std_data) for std_data in entries]

        for standard in standards:
            self.standards[standard.id] = standard
            logger.info(f"Uploaded standard: {standard.name} (ID: {standard.id}) with {len(standard.requirements)} requirements")

        # Log the current standards count after upload
        logger.info(f"Total standards in system after upload: {len(self.standards)}")

        # Return the first standard for simplicity
        return standards[0]
    except json.JSONDecodeError as err:
        # Chain the original error so the parse position is not lost.
        raise ValueError("Invalid JSON format in standard definition file") from err
    except Exception as e:
        logger.error(f"Error processing standard file: {str(e)}")
        raise
async def _read_file_content(self, file: BinaryIO) -> str:
    """
    Read and decode file content.

    Bytes are decoded as UTF-8 first (a leading byte-order mark is dropped)
    and as Latin-1 when that fails. Text returned by a text-mode stream is
    passed through unchanged.

    Args:
        file: The file to read

    Returns:
        File content as string

    Raises:
        ValueError: If ``file.read()`` returns something that is neither
            text nor bytes
    """
    file_content = file.read()

    # A text-mode stream already yields str; there is nothing to decode.
    if isinstance(file_content, str):
        return file_content

    if not isinstance(file_content, (bytes, bytearray)):
        raise ValueError("Unable to decode file content. Please ensure file is text-based.")

    try:
        # "utf-8-sig" behaves like UTF-8 but strips a leading BOM, which
        # json.loads would otherwise reject.
        return file_content.decode('utf-8-sig')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this fallback cannot fail.
        return file_content.decode('latin-1')
async def get_standard_names_for_document(self, document_content: str) -> List[str]:
    """
    Suggest which standards a document should be checked against.

    Delegates the scoring to ``self.matcher.find_relevant_standards`` with a
    relevance threshold of 0.1 and at most 5 results. A fixed fallback list
    is returned when no standards are registered or none is relevant.

    Args:
        document_content: The document content

    Returns:
        Names of the relevant standards in the order returned by the
        matcher, or the fallback list
    """
    # Fallback used whenever matching cannot produce a result
    DEFAULT_STANDARDS = ["ISO-9001", "IEEE-829", "RFC-2119"]

    # Dump the registry to the log to help with debugging
    logger.info(f"Available standards in the system: {len(self.standards)}")
    for std_id, std in self.standards.items():
        logger.info(f" - {std.name} (ID: {std_id})")

    if not self.standards:
        logger.warning("No standards available in the system. Using default standards.")
        return DEFAULT_STANDARDS

    standard_scores = self.matcher.find_relevant_standards(
        document_content=document_content,
        standards=list(self.standards.values()),
        threshold=0.1,  # Minimum relevance threshold
        max_standards=5,  # Maximum number of standards to return
    )

    # Nothing matched: report it and fall back to the defaults
    if not standard_scores:
        logger.info("No relevant standards found based on document content.")
        logger.info(f"Using default standards: {DEFAULT_STANDARDS}")
        return DEFAULT_STANDARDS

    logger.info(f"Found {len(standard_scores)} relevant standards:")
    relevant_standards = []
    for name, score in standard_scores:
        logger.info(f" - {name}: relevance score {score:.2f}")
        relevant_standards.append(name)
    return relevant_standards
+304
View File
@@ -0,0 +1,304 @@
# Standards matching logic
import re
from typing import Dict, List, Set, Tuple, Optional
from loguru import logger
from app.core.models import Standard, Requirement
class StandardsMatcher:
"""
Advanced matching logic to identify relevant standards for documents.
This class implements sophisticated matching algorithms beyond simple text matching.
"""
def __init__(self):
    """Set up the word lists used by the matching methods."""
    # Stopwords, grouped by role; they are ignored when extracting keywords.
    articles = {"the", "a", "an"}
    conjunctions = {"and", "or", "but", "if", "then", "else", "nor", "so", "than"}
    prepositions = {"in", "on", "at", "to", "for", "with", "by", "of"}
    auxiliaries = {
        "is", "are", "was", "were", "be", "been", "being", "have", "has",
        "had", "do", "does", "did", "can", "will", "should",
    }
    interrogatives = {"when", "where", "why", "how"}
    modifiers = {
        "all", "any", "both", "each", "few", "more", "most", "other", "some",
        "such", "no", "not", "only", "own", "same", "too", "very", "just", "now",
    }
    demonstratives = {"this", "that", "these", "those"}
    self.stopwords = (
        articles | conjunctions | prepositions | auxiliaries
        | interrogatives | modifiers | demonstratives
    )

    # Technical terms that indicate compliance requirements
    obligation_terms = [
        "shall", "must", "required", "should", "recommended", "may", "optional",
    ]
    conformance_terms = [
        "compliant", "compliance", "conform", "standard", "specification",
        "requirement",
    ]
    activity_terms = ["procedure", "process", "method", "test", "verify", "validate"]
    approval_terms = [
        "certification", "certified", "approved", "regulation", "regulatory",
        "guideline", "protocol",
    ]
    self.technical_indicators = (
        obligation_terms + conformance_terms + activity_terms + approval_terms
    )

    # Common standard prefixes and abbreviations
    self.standard_prefixes = [
        "iso", "ieee", "astm", "ansi", "iec", "din",
        "bs", "en", "jis", "gb", "api", "asme",
        "nfpa", "ul", "mil", "std", "rfc", "itu",
    ]
def extract_document_sections(self, document_content: str) -> Dict[str, str]:
    """
    Extract sections from a document to improve matching.

    Three kinds of entries are produced:
      * "full_document": the unmodified input
      * one entry per markdown heading (levels 1-3), keyed by heading text
      * one entry per well-known section name (e.g. "scope") found at the
        start of a line, keyed by the lowercase name

    Args:
        document_content: The document content

    Returns:
        Dictionary of section name to section content
    """
    sections = {"full_document": document_content}

    # Markdown headings: each section runs up to the next heading or the
    # end of the document.
    heading_pattern = re.compile(r'^(#{1,3})\s+(.+)$', re.MULTILINE)
    headings = list(heading_pattern.finditer(document_content))
    for index, heading in enumerate(headings):
        title = heading.group(2).strip()
        body_start = heading.end()
        if index + 1 < len(headings):
            body_end = headings[index + 1].start()
        else:
            body_end = len(document_content)
        sections[title] = document_content[body_start:body_end].strip()

    # Look for common document sections by name
    common_sections = [
        "introduction", "scope", "purpose", "references", "definitions",
        "requirements", "compliance", "standards", "conclusion", "summary",
        "appendix", "annex"
    ]
    for section in common_sections:
        # Only the section name is matched case-insensitively. The lookahead
        # must stay case-sensitive: a global (?i) would turn [A-Z] into
        # "any letter" and cut the section at the first continuation line.
        pattern = re.compile(
            rf'(?:^|\n)(?i:{section})(?:[\s:]+)(.*?)(?=\n\s*\n|\n\s*[A-Z]|\Z)',
            re.DOTALL,
        )
        match = pattern.search(document_content)
        if match:
            sections[section] = match.group(1).strip()

    return sections
def extract_key_terms(self, document_content: str) -> List[str]:
    """
    Collect key technical terms from a document.

    Terms come from three sources, all lowercased:
      * adjacent word pairs without stopwords, taken from sentences that
        contain one of ``self.technical_indicators``
      * tokens of two or more characters made of capitals and digits that
        start with a capital letter
      * standard references such as "ISO-9001" or "IEEE 829", built from
        ``self.standard_prefixes``

    Args:
        document_content: The document content

    Returns:
        List of unique key terms (order is not defined)
    """
    key_terms = []

    # Word pairs from sentences that mention a technical indicator
    for sentence in re.split(r'[.!?]\s+', document_content):
        sentence_lower = sentence.lower()
        if not any(marker in sentence_lower for marker in self.technical_indicators):
            continue
        tokens = sentence.split()
        for left, right in zip(tokens, tokens[1:]):
            if left.lower() in self.stopwords or right.lower() in self.stopwords:
                continue
            key_terms.append(f"{left} {right}".lower())

    # Capitalized terms (often defined terms)
    for acronym in re.findall(r'\b[A-Z][A-Z0-9]+\b', document_content):
        key_terms.append(acronym.lower())

    # Standard references (e.g., ISO-9001, IEEE 829)
    for prefix in self.standard_prefixes:
        references = re.findall(rf'\b{prefix}[-\s]?\d+\b', document_content, re.IGNORECASE)
        key_terms.extend(reference.lower() for reference in references)

    # Remove duplicates
    return list(set(key_terms))
def extract_standard_keywords(self, standard: Standard) -> List[str]:
    """
    Build the lowercase keyword list used to match a standard.

    The keywords are the standard name (as written, with hyphens removed,
    and with hyphens replaced by spaces), the non-stopword words of its
    description, and for each requirement: its ID, the adjacent word pairs
    of its description that contain no stopword, and any of its words that
    appear in ``self.technical_indicators``.

    Args:
        standard: The standard to extract keywords from

    Returns:
        List of unique keywords (order is not defined)
    """
    # Standard name and its variations
    keywords = [
        standard.name.lower(),
        standard.name.replace("-", "").lower(),
        standard.name.replace("-", " ").lower(),
    ]

    # Description words, minus stopwords
    if standard.description:
        keywords += [
            token.lower()
            for token in standard.description.split()
            if token.lower() not in self.stopwords
        ]

    for req in standard.requirements:
        keywords.append(req.id.lower())
        if not req.description:
            continue

        words = req.description.split()
        # Adjacent word pairs stand in for noun phrases (simplified approach)
        keywords += [
            f"{first} {second}".lower()
            for first, second in zip(words, words[1:])
            if first.lower() not in self.stopwords
            and second.lower() not in self.stopwords
        ]
        # Individual technical terms
        keywords += [
            word.lower() for word in words
            if word.lower() in self.technical_indicators
        ]

    # Remove duplicates and return
    return list(set(keywords))
def calculate_standard_relevance(self, standard: Standard, document_content: str,
                                 sections: Dict[str, str], key_terms: List[str]) -> float:
    """
    Calculate a relevance score for a standard based on multiple factors.

    The score is the sum of five components, capped at 1.0:
      1. standard name found in the document (0.5 exact, 0.4 for the
         hyphen-less or space-separated variant)
      2. share of the standard's keywords found in the document (up to 0.3)
      3. standard name found in an important section (0.1) and in a
         section title (0.2)
      4. document key terms overlapping the standard's keywords
         (0.01 each, up to 0.2)
      5. requirement description words found in the document
         (0.01 each, up to 0.2)

    Args:
        standard: The standard to evaluate
        document_content: The document content
        sections: Document sections
        key_terms: Key terms extracted from the document

    Returns:
        Relevance score (0.0 to 1.0)
    """
    document_content_lower = document_content.lower()
    name_lower = standard.name.lower()

    # Extract keywords for this standard
    standard_keywords = self.extract_standard_keywords(standard)

    # 1. Check for standard name matches (highest weight)
    name_match_score = 0.0
    if name_lower in document_content_lower:
        name_match_score = 0.5
    elif standard.name.replace("-", "").lower() in document_content_lower:
        name_match_score = 0.4
    elif standard.name.replace("-", " ").lower() in document_content_lower:
        name_match_score = 0.4

    # 2. Check for keyword matches
    keyword_match_score = 0.0
    total_keywords = len(standard_keywords)
    if total_keywords > 0:
        matched_keywords = sum(
            1 for keyword in standard_keywords if keyword in document_content_lower
        )
        keyword_match_score = matched_keywords / total_keywords * 0.3

    # 3. Check for section-specific matches
    section_match_score = 0.0
    important_sections = ["introduction", "scope", "purpose", "references",
                          "standards", "compliance", "requirements"]
    # Only the first important section mentioning the standard counts.
    for section_name in important_sections:
        if section_name in sections and name_lower in sections[section_name].lower():
            section_match_score += 0.1
            break

    # Check for standard name in section titles (first hit only)
    for section_name in sections:
        if name_lower in section_name.lower():
            section_match_score += 0.2
            break

    # 4. Check for key term matches (substring overlap in either direction)
    term_match_score = 0.0
    if key_terms:
        matching_terms = sum(
            1 for term in key_terms
            if any(kw in term or term in kw for kw in standard_keywords)
        )
        term_match_score = min(0.2, 0.01 * matching_terms)

    # 5. Check for requirement-specific matches
    requirement_match_score = 0.0
    for req in standard.requirements:
        # A requirement without a description contributes nothing; the
        # `or ""` keeps a None description from crashing the scoring.
        req_desc_lower = (req.description or "").lower()
        for word in req_desc_lower.split():
            if word in self.stopwords or len(word) <= 3:
                continue
            if word in document_content_lower:
                requirement_match_score += 0.01
    requirement_match_score = min(0.2, requirement_match_score)

    # Calculate final score (sum of all components)
    final_score = (
        name_match_score +
        keyword_match_score +
        section_match_score +
        term_match_score +
        requirement_match_score
    )

    # Cap at 1.0
    return min(final_score, 1.0)
def find_relevant_standards(self, document_content: str, standards: List[Standard],
                            threshold: float = 0.1, max_standards: int = 5) -> List[Tuple[str, float]]:
    """
    Rank the given standards by how relevant they are to a document.

    Each standard is scored with ``calculate_standard_relevance``; scores
    below ``threshold`` are dropped and the rest are returned best-first.

    Args:
        document_content: The document content
        standards: List of available standards
        threshold: Minimum relevance score threshold
        max_standards: Maximum number of standards to return

    Returns:
        List of tuples (standard_name, relevance_score) sorted by relevance
    """
    if not standards:
        return []

    # Document-level features are computed once and shared by all standards
    sections = self.extract_document_sections(document_content)
    key_terms = self.extract_key_terms(document_content)

    standard_scores = []
    for standard in standards:
        score = self.calculate_standard_relevance(
            standard, document_content, sections, key_terms
        )
        # Every score is logged, including those below the threshold
        logger.debug(f"Standard {standard.name} relevance score: {score:.2f}")
        if score < threshold:
            continue
        standard_scores.append((standard.name, score))

    # Highest score first, then keep only the top entries
    ranked = sorted(standard_scores, key=lambda pair: pair[1], reverse=True)
    return ranked[:max_standards]