462 lines
16 KiB
Python
462 lines
16 KiB
Python
# Document processing
|
|
import os
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Optional, BinaryIO, Tuple
|
|
import re
|
|
from loguru import logger
|
|
|
|
from app.core.models import (
|
|
Document,
|
|
DocumentMetadata,
|
|
DocumentStatus,
|
|
ComplianceReport,
|
|
ComplianceIssue,
|
|
ComplianceLevel,
|
|
DocumentEmbedding
|
|
)
|
|
from app.services.embedding import EmbeddingService
|
|
from app.services.reasoning import ReasoningService
|
|
from app.services.standards import StandardsService
|
|
from app.utils.token_counter import count_tokens, truncate_by_tokens
|
|
|
|
class DocumentService:
    """Service for handling document processing and storage.

    Documents and compliance reports are held in in-memory dictionaries keyed
    by their IDs (see ``__init__``), so nothing survives the lifetime of the
    service instance.
    """

    # File extensions accepted by ``upload_document``; compared lowercased.
    # Kept in one place so validation and the error message cannot drift.
    SUPPORTED_EXTENSIONS = ('.txt', '.md', '.rst', '.doc', '.docx', '.pdf')

    # Markdown ATX heading of level 1-3. ``[ \t]+`` (rather than ``\s+``) keeps
    # the separator on the heading's own line, so a bare "#" line cannot
    # swallow the following line as its heading text.
    _HEADING_PATTERN = re.compile(r'^(#{1,3})[ \t]+(.+)$', re.MULTILINE)

    def __init__(self, embedding_service: EmbeddingService, reasoning_service: ReasoningService, standards_service: Optional[StandardsService] = None):
        """Initialize with required services.

        Args:
            embedding_service: Service used to embed document sections
            reasoning_service: Service used to produce compliance reports
            standards_service: Service used to pick relevant standards; a new
                StandardsService is created when none (or a falsy one) is given
        """
        self.embedding_service = embedding_service
        self.reasoning_service = reasoning_service
        self.standards_service = standards_service or StandardsService()
        self.documents = {}  # In-memory storage for documents (replace with DB in production)
        self.reports = {}  # In-memory storage for reports (replace with DB in production)

    async def upload_document(self, file: BinaryIO, filename: str) -> Document:
        """
        Process an uploaded document.

        The document is registered in ``self.documents`` before processing
        starts, so a document whose processing fails stays stored with status
        FAILED.

        Args:
            file: The document file
            filename: Name of the uploaded file

        Returns:
            Document object with metadata

        Raises:
            ValueError: If the file extension is not supported or the content
                cannot be read as text
            Exception: Any error raised while processing is logged and re-raised
        """
        # Validate file type
        if not self._validate_file_type(filename):
            supported = ", ".join(self.SUPPORTED_EXTENSIONS)
            raise ValueError(f"Unsupported file type. Supported types: {supported}")

        # Get file content
        content = await self._read_file_content(file)

        # Use a single timestamp so upload and last-modified times agree.
        now = datetime.now()
        metadata = DocumentMetadata(
            filename=filename,
            file_type=self._get_file_type(filename),
            # NOTE: length of the decoded text (characters), not the byte count.
            file_size=len(content),
            upload_timestamp=now,
            last_modified=now,
        )

        # Create document object
        document_id = str(uuid.uuid4())
        document = Document(
            id=document_id,
            metadata=metadata,
            status=DocumentStatus.PENDING,
            version=1,
        )

        # Store document in memory
        self.documents[document_id] = document

        # Start processing
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED
            raise

        return document

    async def get_document(self, document_id: str) -> Optional[Document]:
        """
        Retrieve a document by ID.

        Args:
            document_id: The ID of the document to retrieve

        Returns:
            Document object if found, None otherwise
        """
        return self.documents.get(document_id)

    async def get_report(self, report_id: str) -> Optional[ComplianceReport]:
        """
        Retrieve a compliance report by ID.

        Args:
            report_id: The ID of the report to retrieve

        Returns:
            ComplianceReport object if found, None otherwise
        """
        return self.reports.get(report_id)

    async def resubmit_document(self, document_id: str, file: BinaryIO) -> Document:
        """
        Resubmit a document with changes.

        Bumps the document version, refreshes its size and last-modified time,
        and processes the new content. Unlike ``upload_document``, a processing
        failure is logged but not re-raised: the document is returned with
        status FAILED and the caller is expected to inspect it.

        Args:
            document_id: The ID of the document to resubmit
            file: The updated document file

        Returns:
            Updated Document object

        Raises:
            ValueError: If no document with ``document_id`` exists or the
                content cannot be read as text
        """
        # Check if document exists
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document with ID {document_id} not found")

        # Get file content
        content = await self._read_file_content(file)

        # Update document metadata
        document.metadata.file_size = len(content)
        document.metadata.last_modified = datetime.now()
        document.version += 1
        document.status = DocumentStatus.PENDING

        # Process the updated document (best-effort: errors are not propagated)
        try:
            await self._process_document(document_id, content)
        except Exception as e:
            logger.error(f"Error processing resubmitted document {document_id}: {str(e)}")
            document.status = DocumentStatus.FAILED

        return document

    async def process_document(self, document_id: str, content: str) -> ComplianceReport:
        """
        Process document and generate compliance report.

        Splits the content into sections, stores the embedding returned by the
        embedding service on the document, selects standards, asks the
        reasoning service for a report, and stores that report in
        ``self.reports``.

        Args:
            document_id: The ID of the document
            content: Document content

        Returns:
            ComplianceReport object

        Raises:
            ValueError: If the document is not found
            Exception: Any error from the underlying services is logged and
                re-raised
        """
        try:
            # Get the document
            document = self.documents.get(document_id)
            if not document:
                raise ValueError(f"Document {document_id} not found")

            # Split document into sections
            sections = self._split_into_sections(content)

            # Generate embeddings for sections
            document.embedding = await self.embedding_service.embed_document(document_id, sections)

            # Identify relevant standards for the document
            if self.standards_service:
                # Log the standards service instance ID to verify singleton pattern
                logger.info(f"Using StandardsService instance: {id(self.standards_service)}")
                logger.info(f"Standards count before matching: {len(self.standards_service.standards)}")

                standard_names = await self.standards_service.get_standard_names_for_document(content)
                logger.info(f"Identified standards for document {document_id}: {standard_names}")
            else:
                # Fallback list used when the standards service is missing/falsy.
                logger.warning(f"No StandardsService available for document {document_id}")
                standard_names = ["ISO-9001", "IEEE-829", "RFC-2119"]

            # Use reasoning service for compliance analysis
            report = await self.reasoning_service.analyze_document(document_id, sections, standard_names)

            # Store the report
            self.reports[report.report_id] = report

            return report

        except Exception as e:
            logger.error(f"Error in document processing: {str(e)}")
            raise

    async def _read_file_content(self, file: BinaryIO) -> str:
        """
        Read and decode file content.

        Bytes are decoded as UTF-8 first and as Latin-1 otherwise. Latin-1
        maps every byte value, so the fallback always succeeds for bytes.

        NOTE(review): because of that fallback, binary formats accepted by
        ``SUPPORTED_EXTENSIONS`` (.pdf, .doc, .docx) decode to text that is not
        meaningful; confirm whether they need real text extraction.

        Args:
            file: The file to read

        Returns:
            File content as string

        Raises:
            ValueError: If ``file.read()`` returns neither bytes nor text
        """
        raw = file.read()

        # A stream opened in text mode already yields decoded text.
        if isinstance(raw, str):
            return raw

        if not isinstance(raw, (bytes, bytearray)):
            raise ValueError("Unable to decode file content. Please ensure file is text-based.")

        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            # Cannot fail: every byte is a valid Latin-1 code point.
            return raw.decode('latin-1')

    def _get_file_type(self, filename: str) -> str:
        """
        Determine file type from filename.

        Args:
            filename: The name of the file

        Returns:
            File type (lowercased extension without the leading dot; empty
            string when the name has no extension)
        """
        _, extension = os.path.splitext(filename)
        return extension.lstrip('.').lower()

    def _validate_file_type(self, filename: str) -> bool:
        """
        Validate if the file type is supported.

        Args:
            filename: Name of the file to validate

        Returns:
            bool: True if file type is supported, False otherwise
        """
        _, ext = os.path.splitext(filename)
        return ext.lower() in self.SUPPORTED_EXTENSIONS

    def _split_into_sections(self, content: str) -> Dict[str, str]:
        """
        Split document content into sections.

        The result always contains the whole text under ``"full_document"``.
        If level 1-3 markdown headings are present, each heading adds an entry
        keyed ``h<level>_<heading text>`` holding the text up to the next
        heading; a repeated heading gets a numeric suffix (``_2``, ``_3``, ...)
        instead of overwriting the earlier section. Without headings, each
        blank-line-separated paragraph longer than 100 characters is stored as
        ``paragraph_<n>``, numbered by its position among non-empty paragraphs.

        Args:
            content: The document content

        Returns:
            Dictionary mapping section names to section content
        """
        # This is a simple implementation - in production, you would use more
        # advanced techniques like heading detection, markdown parsing, etc.
        sections = {"full_document": content}

        headings = list(self._HEADING_PATTERN.finditer(content))

        if headings:
            # Each section ends where the next heading starts (or at the end).
            boundaries = [m.start() for m in headings[1:]] + [len(content)]
            for heading, end_pos in zip(headings, boundaries):
                level = len(heading.group(1))
                title = heading.group(2).strip()
                body = content[heading.end():end_pos].strip()

                base_key = f"h{level}_{title}"
                key = base_key
                duplicate_index = 2
                while key in sections:
                    # Same heading seen before: keep both sections.
                    key = f"{base_key}_{duplicate_index}"
                    duplicate_index += 1

                sections[key] = body
        else:
            # No headings found, try to split by newlines into paragraphs
            paragraphs = [p for p in content.split('\n\n') if p.strip()]
            for number, paragraph in enumerate(paragraphs, start=1):
                if len(paragraph) > 100:  # Only include substantial paragraphs
                    sections[f"paragraph_{number}"] = paragraph

        return sections

    async def _generate_mock_report(self, document_id: str, sections: Dict[str, str]) -> ComplianceReport:
        """
        Generate a mock compliance report for development/testing.

        Runs three keyword-based checks against ``sections["full_document"]``
        (when present) and derives a score of ``1.0 - 0.1 * issue_count``,
        floored at 0.0.

        Args:
            document_id: The ID of the document
            sections: Dictionary of document sections

        Returns:
            ComplianceReport object
        """
        # In production, this would use the reasoning service
        # For now, we'll generate a simple mock report
        issues = []

        if "full_document" in sections:
            content = sections["full_document"]
            lowered = content.lower()

            # Check for missing sections (mock check)
            if "introduction" not in lowered:
                issues.append(ComplianceIssue(
                    section="Document Structure",
                    description="Missing introduction section",
                    level=ComplianceLevel.MAJOR,
                    recommendation="Add an introduction section to provide context for the document"
                ))

            # Check for formatting issues (mock check)
            if content.count('#') < 3:
                issues.append(ComplianceIssue(
                    section="Formatting",
                    description="Insufficient section headings",
                    level=ComplianceLevel.MINOR,
                    recommendation="Use markdown headings to better structure the document"
                ))

            # Check for technical compliance (mock check)
            if "compliance" in lowered and "standard" not in lowered:
                issues.append(ComplianceIssue(
                    section="Technical Content",
                    description="Mentions compliance but doesn't reference specific standards",
                    level=ComplianceLevel.CRITICAL,
                    recommendation="Specify which standards or regulations the document complies with"
                ))

        # Calculate mock compliance score (1.0 when there are no issues)
        compliance_score = max(0.0, 1.0 - (len(issues) * 0.1))

        # Create summary based on issues
        if not issues:
            summary = "The document meets all compliance requirements. No issues found."
        else:
            labelled_levels = (
                (ComplianceLevel.CRITICAL, "critical"),
                (ComplianceLevel.MAJOR, "major"),
                (ComplianceLevel.MINOR, "minor"),
            )
            parts = []
            for level, label in labelled_levels:
                count = sum(1 for issue in issues if issue.level == level)
                if count:
                    parts.append(f"{count} {label}")

            summary = f"The document has {len(issues)} compliance issues: " + ", ".join(parts)
            summary = summary.rstrip(", ") + "."
            summary += " See detailed report for recommendations."

        # Create report
        return ComplianceReport(
            document_id=document_id,
            compliance_score=compliance_score,
            summary=summary,
            issues=issues
        )

    async def _process_document(self, document_id: str, content: str) -> None:
        """
        Internal method to process a document and update its status.

        Sets the status to PROCESSING, runs ``process_document``, records the
        resulting report ID on the document and marks it COMPLETED. On any
        error the status becomes FAILED and the error is re-raised.

        Args:
            document_id: The ID of the document to process
            content: The document content

        Raises:
            ValueError: If the document is not found
        """
        # Look the document up before the try block so the error handler
        # below never touches an unbound name.
        document = self.documents.get(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        try:
            # Update status to processing
            document.status = DocumentStatus.PROCESSING

            # Generate compliance report
            report = await self.process_document(document_id, content)

            # Store report ID in document
            document.reports.append(report.report_id)

            # Update document status
            document.status = DocumentStatus.COMPLETED
        except Exception:
            # Update document status to failed
            document.status = DocumentStatus.FAILED
            raise

    async def get_document_stats(self, document_id: str) -> Dict[str, object]:
        """
        Get statistics for a document.

        Issue counts and the compliance score come from the most recently
        appended report; without one the score is None and the counts are 0.

        Args:
            document_id: The ID of the document

        Returns:
            Dictionary containing document statistics

        Raises:
            ValueError: If the document is not found
        """
        document = await self.get_document(document_id)
        if not document:
            raise ValueError(f"Document {document_id} not found")

        latest_report = None
        if document.reports:
            latest_report = await self.get_report(document.reports[-1])

        return {
            "document_id": document_id,
            "version": document.version,
            "status": document.status,
            "file_size": document.metadata.file_size,
            "upload_date": document.metadata.upload_timestamp,
            "last_modified": document.metadata.last_modified,
            "num_reports": len(document.reports),
            "latest_compliance_score": latest_report.compliance_score if latest_report else None,
            "critical_issues": latest_report.critical_issues_count if latest_report else 0,
            "major_issues": latest_report.major_issues_count if latest_report else 0,
            "minor_issues": latest_report.minor_issues_count if latest_report else 0,
        }

    async def cleanup_old_documents(self, days: int = 30) -> List[str]:
        """
        Remove documents older than specified days.

        Age is measured from the upload timestamp against ``datetime.now()``.
        Reports referenced by a removed document are removed as well.

        Args:
            days: Number of days after which documents should be removed

        Returns:
            List of removed document IDs
        """
        cutoff_date = datetime.now() - timedelta(days=days)
        removed_ids = []

        # Iterate over a snapshot because entries are removed during the loop.
        for doc_id, document in list(self.documents.items()):
            if document.metadata.upload_timestamp >= cutoff_date:
                continue

            # Remove associated reports
            for report_id in document.reports:
                self.reports.pop(report_id, None)

            # Remove document
            self.documents.pop(doc_id)
            removed_ids.append(doc_id)

        return removed_ids
|
|
|
|
|