Files
ds_scp_task_solution/app/services/document.py
T
Aherobo Ovie Victor 0e3e22e8cb Initial commit
2025-07-17 22:20:25 +01:00

462 lines
16 KiB
Python

# Document processing
import os
import re
import uuid
from datetime import datetime, timedelta
from typing import Any, BinaryIO, Dict, List, Optional, Tuple

from loguru import logger

from app.core.models import (
    ComplianceIssue,
    ComplianceLevel,
    ComplianceReport,
    Document,
    DocumentEmbedding,
    DocumentMetadata,
    DocumentStatus,
)
from app.services.embedding import EmbeddingService
from app.services.reasoning import ReasoningService
from app.services.standards import StandardsService
from app.utils.token_counter import count_tokens, truncate_by_tokens
class DocumentService:
    """Service for handling document processing and storage.

    Documents and compliance reports are held in plain in-memory dictionaries
    keyed by their IDs (see ``__init__``), so nothing is persisted.
    """

    # File extensions accepted on upload; compared against the lower-cased
    # extension of the uploaded filename.
    _SUPPORTED_EXTENSIONS = frozenset({'.txt', '.md', '.rst', '.doc', '.docx', '.pdf'})

    # Markdown headings of level 1-3. The separator is restricted to spaces
    # and tabs so that a bare "##" line cannot swallow the following line as
    # its heading text (``\s`` would also match the newline).
    _HEADING_PATTERN = re.compile(r'^(#{1,3})[ \t]+(.+)$', re.MULTILINE)

    def __init__(
        self,
        embedding_service: EmbeddingService,
        reasoning_service: ReasoningService,
        standards_service: Optional[StandardsService] = None,
    ):
        """
        Initialize with required services.

        Args:
            embedding_service: Service used to embed document sections
            reasoning_service: Service used to run the compliance analysis
            standards_service: Service used to pick relevant standards; a new
                StandardsService is created when omitted
        """
        self.embedding_service = embedding_service
        self.reasoning_service = reasoning_service
        # Explicit None check: a supplied service must not be discarded just
        # because it evaluates as falsy.
        if standards_service is None:
            standards_service = StandardsService()
        self.standards_service = standards_service
        self.documents = {}  # In-memory storage for documents (replace with DB in production)
        self.reports = {}  # In-memory storage for reports (replace with DB in production)

    async def upload_document(self, file: BinaryIO, filename: str) -> Document:
        """
        Process an uploaded document.

        Args:
            file: The document file
            filename: Name of the uploaded file

        Returns:
            Document object with metadata

        Raises:
            ValueError: If the file extension is not supported
            Exception: Any error raised while processing is logged and re-raised
        """
        # Validate file type
        if not self._validate_file_type(filename):
            raise ValueError("Unsupported file type. Supported types: .txt, .md, .rst, .doc, .docx, .pdf")

        # Get file content
        content = await self._read_file_content(file)

        # NOTE(review): this is the character count of the decoded text, which
        # differs from the byte size for non-ASCII content - confirm intent.
        file_size = len(content)
        file_type = self._get_file_type(filename)

        # A single timestamp keeps upload and modification times identical
        # for a freshly uploaded document.
        now = datetime.now()
        metadata = DocumentMetadata(
            filename=filename,
            file_type=file_type,
            file_size=file_size,
            upload_timestamp=now,
            last_modified=now,
        )

        # Create and register the document before processing so that the
        # processing step can look it up by ID.
        document_id = str(uuid.uuid4())
        document = Document(
            id=document_id,
            metadata=metadata,
            status=DocumentStatus.PENDING,
            version=1,
        )
        self.documents[document_id] = document

        # Start processing
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing document {document_id}: {e}")
            document.status = DocumentStatus.FAILED
            raise

        return document

    async def get_document(self, document_id: str) -> Optional[Document]:
        """
        Retrieve a document by ID.

        Args:
            document_id: The ID of the document to retrieve

        Returns:
            Document object if found, None otherwise
        """
        return self.documents.get(document_id)

    async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
        """
        Retrieve a compliance report by ID.

        Args:
            report_id: The ID of the report to retrieve

        Returns:
            ComplianceReport object if found, None otherwise
        """
        return self.reports.get(report_id)

    async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
        """
        Resubmit a document with changes.

        Unlike ``upload_document``, a processing failure is logged and
        reflected in the returned document's status instead of being raised.

        Args:
            document_id: The ID of the document to resubmit
            file: The updated document file

        Returns:
            Updated Document object

        Raises:
            ValueError: If no document with the given ID exists
        """
        # Check if document exists
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document with ID {document_id} not found")

        # Get file content
        content = await self._read_file_content(file)

        # Update document metadata
        # NOTE(review): character count of the decoded text, not bytes.
        document.metadata.file_size = len(content)
        document.metadata.last_modified = datetime.now()
        document.version += 1
        document.status = DocumentStatus.PENDING

        # Process the updated document
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing resubmitted document {document_id}: {e}")
            document.status = DocumentStatus.FAILED

        return document

    async def process_document(self, document_id: str, content: str) -> ComplianceReport:
        """
        Process document and generate compliance report.

        Args:
            document_id: The ID of the document
            content: Document content

        Returns:
            ComplianceReport object

        Raises:
            ValueError: If the document is not found
            Exception: Any other error is logged and re-raised
        """
        try:
            # Get the document
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            # Split document into sections
            sections = self._split_into_sections(content)

            # Generate embeddings for sections
            document.embedding = await self.embedding_service.embed_document(document_id, sections)

            # Identify relevant standards for the document
            if self.standards_service is not None:
                # Log the standards service instance ID to verify singleton pattern
                logger.info(f"Using StandardsService instance: {id(self.standards_service)}")
                logger.info(f"Standards count before matching: {len(self.standards_service.standards)}")
                standard_names = await self.standards_service.get_standard_names_for_document(content)
                logger.info(f"Identified standards for document {document_id}: {standard_names}")
            else:
                # Fallback list used only when no standards service is attached.
                logger.warning(f"No StandardsService available for document {document_id}")
                standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]

            # Use reasoning service for compliance analysis
            report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)

            # Store the report
            self.reports[report.report_id] = report
            return report
        except Exception as e:
            logger.error(f"Error in document processing: {e}")
            raise

    async def _read_file_content(self, file: BinaryIO) -> str:
        """
        Read and decode file content.

        UTF-8 is tried first, then Latin-1 as a fallback.

        Args:
            file: The file to read

        Returns:
            File content as string

        Raises:
            ValueError: If the content cannot be decoded
        """
        file_content = file.read()

        # Try to decode as UTF-8
        try:
            return file_content.decode('utf-8')
        except UnicodeDecodeError:
            pass

        # Latin-1 maps every byte value to a character, so this fallback is
        # expected to succeed for any byte string; the handler is kept as a
        # safeguard and now only catches decoding errors.
        try:
            return file_content.decode('latin-1')
        except UnicodeDecodeError as exc:
            raise ValueError("Unable to decode file content. Please ensure file is text-based.") from exc

    def _get_file_type(self, filename: str) -> str:
        """
        Determine file type from filename.

        Args:
            filename: The name of the file

        Returns:
            File type (lower-cased extension without the leading dot)
        """
        _, extension = os.path.splitext(filename)
        return extension.lstrip('.').lower()

    def _validate_file_type(self, filename: str) -> bool:
        """
        Validate if the file type is supported.

        Args:
            filename: Name of the file to validate

        Returns:
            bool: True if file type is supported, False otherwise
        """
        # A missing or empty filename has no extension to check; report it as
        # unsupported instead of letting os.path.splitext fail on None.
        if not filename:
            return False
        _, ext = os.path.splitext(filename)
        return ext.lower() in self._SUPPORTED_EXTENSIONS

    def _split_into_sections(self, content: str) -> Dict[str, str]:
        """
        Split document content into sections.

        The result always contains the whole text under ``"full_document"``.
        When level 1-3 markdown headings are present, each heading adds an
        entry keyed ``h<level>_<heading text>``; a heading whose key is already
        taken gets a numeric suffix (``_2``, ``_3``, ...) so that repeated
        headings do not overwrite each other. Without headings, paragraphs
        longer than 100 characters are added as ``paragraph_<n>``.

        Args:
            content: The document content

        Returns:
            Dictionary mapping section names to section content
        """
        # Add the whole document as one section
        sections = {"full_document": content}

        matches = list(self._HEADING_PATTERN.finditer(content))
        if matches:
            for index, match in enumerate(matches):
                heading_level = len(match.group(1))
                section_name = match.group(2).strip()

                # Section body runs from this heading to the next, or to the end
                start_pos = match.end()
                if index + 1 < len(matches):
                    end_pos = matches[index + 1].start()
                else:
                    end_pos = len(content)

                base_key = f"h{heading_level}_{section_name}"
                section_key = base_key
                duplicate_number = 2
                while section_key in sections:
                    section_key = f"{base_key}_{duplicate_number}"
                    duplicate_number += 1

                sections[section_key] = content[start_pos:end_pos].strip()
        else:
            # No headings found, fall back to blank-line separated paragraphs
            paragraphs = [p for p in content.split('\n\n') if p.strip()]
            for index, paragraph in enumerate(paragraphs, start=1):
                if len(paragraph) > 100:  # Only include substantial paragraphs
                    sections[f"paragraph_{index}"] = paragraph

        return sections

    async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
        """
        Generate a mock compliance report for development/testing.

        The checks are simple text heuristics on the ``"full_document"``
        section; each issue found lowers the score by 0.1.

        Args:
            document_id: The ID of the document
            sections: Dictionary of document sections

        Returns:
            ComplianceReport object
        """
        issues = []

        if "full_document" in sections:
            content = sections["full_document"]
            lowered = content.lower()

            # Check for missing sections (mock check)
            if "introduction" not in lowered:
                issues.append(ComplianceIssue(
                    section="Document Structure",
                    description="Missing introduction section",
                    level=ComplianceLevel.MAJOR,
                    recommendation="Add an introduction section to provide context for the document"
                ))

            # Check for formatting issues (mock check)
            if content.count('#') < 3:
                issues.append(ComplianceIssue(
                    section="Formatting",
                    description="Insufficient section headings",
                    level=ComplianceLevel.MINOR,
                    recommendation="Use markdown headings to better structure the document"
                ))

            # Check for technical compliance (mock check)
            if "compliance" in lowered and "standard" not in lowered:
                issues.append(ComplianceIssue(
                    section="Technical Content",
                    description="Mentions compliance but doesn't reference specific standards",
                    level=ComplianceLevel.CRITICAL,
                    recommendation="Specify which standards or regulations the document complies with"
                ))

        # Calculate mock compliance score (1.0 when there are no issues)
        compliance_score = max(0.0, 1.0 - (len(issues) * 0.1))

        # Create summary based on issues
        if not issues:
            summary = "The document meets all compliance requirements. No issues found."
        else:
            critical_count = sum(1 for i in issues if i.level == ComplianceLevel.CRITICAL)
            major_count = sum(1 for i in issues if i.level == ComplianceLevel.MAJOR)
            minor_count = sum(1 for i in issues if i.level == ComplianceLevel.MINOR)
            summary = f"The document has {len(issues)} compliance issues: "
            if critical_count:
                summary += f"{critical_count} critical, "
            if major_count:
                summary += f"{major_count} major, "
            if minor_count:
                summary += f"{minor_count} minor."
            else:
                # Drop the dangling separator left by the last listed count
                summary = summary.rstrip(", ") + "."
            summary += " See detailed report for recommendations."

        # Create report
        return ComplianceReport(
            document_id=document_id,
            compliance_score=compliance_score,
            summary=summary,
            issues=issues
        )

    async def _process_document(self, document_id: str, content: str) -> None:
        """
        Internal method to process a document and update its status.

        The status moves to PROCESSING, then to COMPLETED once a report has
        been generated and recorded on the document, or to FAILED if anything
        raises along the way.

        Args:
            document_id: The ID of the document to process
            content: The document content

        Raises:
            ValueError: If the document is not found
            Exception: Any processing error is re-raised after marking the
                document as failed
        """
        # Look the document up before entering the try block so the error
        # handler always has a bound document to update.
        document = self.documents.get(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        try:
            # Update status to processing
            document.status = DocumentStatus.PROCESSING

            # Generate compliance report
            report = await self.process_document(document_id, content)

            # Store report ID in document
            document.reports.append(report.report_id)

            # Update document status
            document.status = DocumentStatus.COMPLETED
        except Exception:
            # Update document status to failed
            document.status = DocumentStatus.FAILED
            raise

    async def get_document_stats(self, document_id: str) -> Dict[str, Any]:
        """
        Get statistics for a document.

        Score and issue counts come from the most recently recorded report;
        without one the score is None and the counts are 0.

        Args:
            document_id: The ID of the document

        Returns:
            Dictionary containing document statistics

        Raises:
            ValueError: If the document is not found
        """
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        latest_report = None
        if document.reports:
            latest_report = await self.get_report(document.reports[-1])

        stats = {
            "document_id": document_id,
            "version": document.version,
            "status": document.status,
            "file_size": document.metadata.file_size,
            "upload_date": document.metadata.upload_timestamp,
            "last_modified": document.metadata.last_modified,
            "num_reports": len(document.reports),
            "latest_compliance_score": latest_report.compliance_score if latest_report else None,
            "critical_issues": latest_report.critical_issues_count if latest_report else 0,
            "major_issues": latest_report.major_issues_count if latest_report else 0,
            "minor_issues": latest_report.minor_issues_count if latest_report else 0
        }
        return stats

    async def cleanup_old_documents(self, days: int = 30) -> List[str]:
        """
        Remove documents older than specified days.

        Age is measured from the document's upload timestamp. Reports listed
        on a removed document are removed as well.

        Args:
            days: Number of days after which documents should be removed

        Returns:
            List of removed document IDs
        """
        cutoff_date = datetime.now() - timedelta(days=days)
        removed_ids = []

        # Iterate over a snapshot because entries are removed during the loop
        for doc_id, document in list(self.documents.items()):
            if document.metadata.upload_timestamp < cutoff_date:
                # Remove associated reports
                for report_id in document.reports:
                    self.reports.pop(report_id, None)
                # Remove document
                self.documents.pop(doc_id)
                removed_ids.append(doc_id)

        return removed_ids